
Workflow Update and Signal handlers concurrency sample #123

Merged

Changes from 13 commits (30 commits total)
* 5d0307d Atomic message handlers sample (drewhoskins, Jun 19, 2024)
* bca534a Remove resize jobs to reduce code size (drewhoskins, Jun 19, 2024)
* 8b0a6ed Misc polish (drewhoskins, Jun 19, 2024)
* fb7b32f Add test (drewhoskins, Jun 19, 2024)
* 42d1f12 Format code (drewhoskins, Jun 19, 2024)
* c96f06d Continue as new (drewhoskins, Jun 20, 2024)
* 6944099 Formatting (drewhoskins, Jun 20, 2024)
* ec1fb89 Feedback, readme, restructure files and directories (drewhoskins, Jun 22, 2024)
* dd58c64 Format (drewhoskins, Jun 22, 2024)
* 37e56ed More feedback. Add test-continue-as-new flag. (drewhoskins, Jun 24, 2024)
* a1506b1 Feedback; throw ApplicationFailures from update handlers (drewhoskins, Jun 24, 2024)
* 2cad3dd Formatting (drewhoskins, Jun 24, 2024)
* d5db7d7 __init__.py (drewhoskins, Jun 24, 2024)
* f39841c Fix lint issues (drewhoskins, Jun 24, 2024)
* 344d694 Dan Feedback (drewhoskins, Jun 25, 2024)
* fc74a69 More typehints (drewhoskins, Jun 25, 2024)
* 0b84c25 s/atomic/safe/ (drewhoskins, Jun 25, 2024)
* c8e9075 Fix and demo idempotency (drewhoskins, Jun 26, 2024)
* 4fc6dac Compatibility with 3.8 (drewhoskins, Jun 26, 2024)
* 3ba8882 More feedback (drewhoskins, Jun 27, 2024)
* f47369e Re-add tests (drewhoskins, Jun 27, 2024)
* 5dc6185 Fix flaky test (drewhoskins, Jun 27, 2024)
* 5b45b21 Improve update and tests (drewhoskins-temporal, Jul 8, 2024)
* ce4d384 Ruff linting (drewhoskins-temporal, Jul 8, 2024)
* 52429bd Use consistent verbs, improve health check (drewhoskins-temporal, Jul 8, 2024)
* 74867f1 poe format (drewhoskins-temporal, Jul 8, 2024)
* c6bdd12 Minor sample improvements (drewhoskins-temporal, Jul 8, 2024)
* 62f24a2 Skip update tests under Java test server (dandavison, Jul 22, 2024)
* d933042 Merge pull request #1 from dandavison/drewhoskins_concurrency_sample-dan (drewhoskins-temporal, Jul 24, 2024)
* 31e2d59 Merge branch 'main' into drewhoskins_concurrency_sample (drewhoskins-temporal, Jul 24, 2024)
1 change: 1 addition & 0 deletions README.md
@@ -52,6 +52,7 @@ Some examples require extra dependencies. See each sample's directory for specif
* [hello_signal](hello/hello_signal.py) - Send signals to a workflow.
<!-- Keep this list in alphabetical order -->
* [activity_worker](activity_worker) - Use Python activities from a workflow in another language.
* [atomic_message_handlers](updates_and_signals/atomic_message_handlers/) - Safely handling updates and signals.
@dandavison (Contributor), Jun 25, 2024:
I think the name of the sample should be changed to something like safe_message_handling. It's not about atomicity -- the sample doesn't demonstrate rolling back of incomplete side effects. Rather it's about maintaining strict isolation between handler executions, via serialization of handler executions. In any case, we don't want users to think this is showing a specialized form of message handling that they can ignore; we want them to consider whether they need this for any workflow with message handlers.

Contributor (Author):
Good idea. "Safe" feels much more like something I'm supposed to read.

Member:
Not the biggest fan of "safe" vs "atomic" since the latter is more discoverable/descriptive when looking at the list of samples, but I don't have a strong opinion here.

Contributor:
@cretz we could choose a word other than "safe", but I argued above that "atomic" isn't the right word.

@cretz (Member), Jun 25, 2024:
I don't think "atomic" relates to rollback at all. Atomic just means one at a time or uninterruptible, as opposed to "transactional". But many can also see it as meaning "quick" or "all or none", but I don't see it that way when I see it used. I think atomic is an ok word, but again I don't have a strong opinion. Also "safe" has a lot of meanings for Temporal workflow code. Many users will be ok w/ their handlers running concurrently and will still be "safe". Maybe "serial" or something, unsure.

Contributor:
Yes, to be honest I'm not in love with "safe" and implying that any other usage style is not safe.

Hm, an atomic operation is one that either completes in its entirety or behaves as if it never started, and can't be seen in an intermediate state. So, if the operation has multiple stages with side effects, that would require some notion of rollback. It's usually synonymous with "transactional". I agree it's closely related to the idea of serializing executions so that they occur one at a time, since that's one way of ensuring that one execution can't see in-progress state of another, but using "atomic" would imply that message handling that does multiple writes can rollback incomplete changes. I think here we're talking about "serialized message processing" or "preventing corruption of shared state by message handlers".

Contributor (Author):
There's more here than just concurrency, such as dangling handlers. Sticking with safe.
I think I'm going to touch on idempotency as well in my next push, though we should probably also add a more focused idempotency sample.

Contributor (Author):
> Yes, to be honest I'm not in love with "safe" and implying that any other usage style is not safe.

I don't think it implies that. "Robust" is an alternate word.

@drewhoskins-temporal (Contributor, Author), Jun 26, 2024:
Update: added idempotency. I didn't use the built-in update ID, since it wasn't necessary here. Maybe that can be our separate idempotency sample.
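As a sketch of the idempotency idea mentioned here, keyed on a caller-supplied name rather than the built-in update ID: the names (`ClusterState`, `allocate`) are hypothetical illustrations, not the sample's actual API.

```python
class ClusterState:
    """Hypothetical sketch of an idempotent allocation handler."""

    def __init__(self, capacity: int = 10) -> None:
        self.available = capacity
        self.jobs = {}  # task_name -> nodes allocated

    def allocate(self, task_name: str, num_nodes: int) -> int:
        # Idempotency keyed on the task name: a retried or duplicated
        # request returns the original allocation instead of allocating twice.
        if task_name in self.jobs:
            return self.jobs[task_name]
        if num_nodes > self.available:
            raise ValueError(
                f"Cannot allocate {num_nodes} nodes; have only {self.available} available"
            )
        self.available -= num_nodes
        self.jobs[task_name] = num_nodes
        return num_nodes


state = ClusterState()
state.allocate("task-1", 3)
state.allocate("task-1", 3)  # duplicate delivery: no double allocation
print(state.available)  # 7, not 4
```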

* [bedrock](bedrock) - Orchestrate a chatbot with Amazon Bedrock.
* [cloud_export_to_parquet](cloud_export_to_parquet) - Set up schedule workflow to process exported files on an hourly basis
* [context_propagation](context_propagation) - Context propagation through workflows/activities via interceptor.
74 changes: 74 additions & 0 deletions tests/updates_and_signals/atomic_message_handlers_test.py
@@ -0,0 +1,74 @@
import uuid

from temporalio.client import Client, WorkflowUpdateFailedError
from temporalio.worker import Worker

from updates_and_signals.atomic_message_handlers.activities import (
allocate_nodes_to_job,
deallocate_nodes_for_job,
find_bad_nodes,
)
from updates_and_signals.atomic_message_handlers.starter import do_cluster_lifecycle
from updates_and_signals.atomic_message_handlers.workflow import (
ClusterManagerAllocateNNodesToJobInput,
ClusterManagerInput,
ClusterManagerWorkflow,
)


async def test_atomic_message_handlers(client: Client):
task_queue = f"tq-{uuid.uuid4()}"
async with Worker(
client,
task_queue=task_queue,
workflows=[ClusterManagerWorkflow],
activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes],
):
cluster_manager_handle = await client.start_workflow(
ClusterManagerWorkflow.run,
ClusterManagerInput(),
id=f"ClusterManagerWorkflow-{uuid.uuid4()}",
task_queue=task_queue,
)
await do_cluster_lifecycle(cluster_manager_handle, delay_seconds=1)
result = await cluster_manager_handle.result()
assert result.max_assigned_nodes == 12
assert result.num_currently_assigned_nodes == 0


async def test_update_failure(client: Client):
task_queue = f"tq-{uuid.uuid4()}"
async with Worker(
client,
task_queue=task_queue,
workflows=[ClusterManagerWorkflow],
activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes],
):
cluster_manager_handle = await client.start_workflow(
ClusterManagerWorkflow.run,
ClusterManagerInput(),
id=f"ClusterManagerWorkflow-{uuid.uuid4()}",
task_queue=task_queue,
)

await cluster_manager_handle.signal(ClusterManagerWorkflow.start_cluster)

await cluster_manager_handle.execute_update(
ClusterManagerWorkflow.allocate_n_nodes_to_job,
            ClusterManagerAllocateNNodesToJobInput(num_nodes=24, task_name="big-task"),
)
try:
# Try to allocate too many nodes
await cluster_manager_handle.execute_update(
ClusterManagerWorkflow.allocate_n_nodes_to_job,
                ClusterManagerAllocateNNodesToJobInput(
                    num_nodes=3, task_name="little-task"
                ),
)
except WorkflowUpdateFailedError as e:
assert e.cause.message == "Cannot allocate 3 nodes; have only 1 available"
finally:
await cluster_manager_handle.signal(ClusterManagerWorkflow.shutdown_cluster)
result = await cluster_manager_handle.result()
assert result.num_currently_assigned_nodes == 24
20 changes: 20 additions & 0 deletions updates_and_signals/atomic_message_handlers/README.md
Member:

I think this can just be at a top-level directory of atomic_message_handlers, no need to nest an extra directory deep

Contributor (Author):

🤔 I wanted people to see updates and signals for discoverability, and we're planning at least one more updates sample.

Member:

We haven't usually grouped by those top-level features before but more by what the sample does. So we don't have interceptors/context_propagation and interceptors/sentry, just two top-level separate samples that use the same Temporal features. We just need to determine whether we want this type of grouping now and maybe apply it generally. I know our other samples repositories have also tried to avoid nesting most samples.

Member:

The primary README at the root of this repo should be updated to reference this sample

@@ -0,0 +1,20 @@
# Atomic message handlers

This sample shows off important techniques for handling signals and updates, aka messages. In particular, it illustrates how message handlers can interleave and how you can manage that.

* Here, using `workflow.wait_condition`, signal and update handlers only operate while the workflow is in a certain state: between `cluster_started` and `cluster_shutdown`.
* You can call `start_workflow` with an initial signal that runs before anything else except the workflow's constructor. This pattern is known as "signal-with-start."
* Message handlers can block, and their actions can interleave with one another and with the main workflow. This can easily cause bugs, so we use a lock to protect shared state from interleaved access.
* Message handlers should also finish before the workflow run completes. One way to ensure this is to have them hold the same lock.
* An "entity" workflow, i.e. a long-lived workflow, periodically "continues as new". It must do this to prevent its history from growing too large, and it passes its state to the next workflow execution. You can check `workflow.info().is_continue_as_new_suggested()` to see when it's time. Just make sure message handlers have finished before doing so.
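The first and third points can be sketched in plain asyncio (gate handlers on workflow state, then serialize them with a lock). In a real Temporal workflow the gate would be `await workflow.wait_condition(...)`; all names here are illustrative, not the sample's actual classes.

```python
import asyncio


class ClusterManager:
    """Illustrative sketch only. In a Temporal workflow the gate below would
    be `await workflow.wait_condition(lambda: self.started)` and the lock an
    asyncio.Lock held inside the @workflow.defn class."""

    def __init__(self) -> None:
        self.started = asyncio.Event()
        self.lock = asyncio.Lock()  # serializes handler bodies
        self.assigned = {}

    def start_cluster(self) -> None:
        # Plays the role of the start_cluster signal handler.
        self.started.set()

    async def allocate(self, task_name: str, num_nodes: int) -> int:
        # Plays the role of an update handler.
        await self.started.wait()   # gate on workflow state
        async with self.lock:       # protect shared state across awaits
            self.assigned[task_name] = num_nodes
            await asyncio.sleep(0)  # stands in for an awaited activity
            return num_nodes


async def demo():
    mgr = ClusterManager()
    pending = asyncio.gather(
        mgr.allocate("task-1", 2), mgr.allocate("task-2", 2)
    )
    await asyncio.sleep(0)  # let both handlers reach the gate and block
    mgr.start_cluster()     # handlers proceed only after this
    results = await pending
    return mgr, results


mgr, results = asyncio.run(demo())
print(results, mgr.assigned)
```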

To run, first see [README.md](../../README.md) for prerequisites.

Then, from this directory, run the worker in one terminal and the starter in another:

```bash
poetry run python worker.py
poetry run python starter.py
```

This will start a worker to run your workflow and activities, then start a ClusterManagerWorkflow and put it through its paces.
Empty file.
45 changes: 45 additions & 0 deletions updates_and_signals/atomic_message_handlers/activities.py
@@ -0,0 +1,45 @@
import asyncio
from dataclasses import dataclass
from typing import List

from temporalio import activity


@dataclass(kw_only=True)
Member:
Suggested change: replace `@dataclass(kw_only=True)` with `@dataclass`.

Probably not needed, but no big deal

Contributor (Author):
I prefer named arguments in general for 2+ parameters. Cuts down on callsite bugs and makes them clearer.

Member:
You can still use named arguments. We use them in lots of places, but since we're the only users of them we don't need to set this setting to force us to use them. Also, we have a CI check for our samples in 3.8 and I don't think this came about until 3.10 (we can look into relaxing our CI version constraints though).

Contributor (Author):
Ah, too bad. I'd rather people who pattern-match off of this sample be directed toward best practices. Will remove for now. I wonder if we have stats on python versions people actually use in the wild?

class AllocateNodesToJobInput:
nodes: List[str]
task_name: str


@activity.defn
async def allocate_nodes_to_job(input: AllocateNodesToJobInput):
print(f"Assigning nodes {input.nodes} to job {input.task_name}")
await asyncio.sleep(0.1)


@dataclass(kw_only=True)
class DeallocateNodesForJobInput:
nodes: List[str]
task_name: str


@activity.defn
async def deallocate_nodes_for_job(input: DeallocateNodesForJobInput):
print(f"Deallocating nodes {input.nodes} from job {input.task_name}")
await asyncio.sleep(0.1)


@dataclass(kw_only=True)
class FindBadNodesInput:
nodes_to_check: List[str]


@activity.defn
async def find_bad_nodes(input: FindBadNodesInput) -> List[str]:
await asyncio.sleep(0.1)
bad_nodes = [n for n in input.nodes_to_check if int(n) % 5 == 0]
if bad_nodes:
print(f"Found bad nodes: {bad_nodes}")
else:
print("No new bad nodes found.")
return bad_nodes
80 changes: 80 additions & 0 deletions updates_and_signals/atomic_message_handlers/starter.py
@@ -0,0 +1,80 @@
import argparse
import asyncio
import logging
import uuid
from typing import Optional

from temporalio import client, common
from temporalio.client import Client, WorkflowHandle

from updates_and_signals.atomic_message_handlers.workflow import (
ClusterManagerAllocateNNodesToJobInput,
ClusterManagerDeleteJobInput,
ClusterManagerInput,
ClusterManagerWorkflow,
)


async def do_cluster_lifecycle(wf: WorkflowHandle, delay_seconds: Optional[int] = None):

await wf.signal(ClusterManagerWorkflow.start_cluster)

allocation_updates = []
for i in range(6):
allocation_updates.append(
wf.execute_update(
ClusterManagerWorkflow.allocate_n_nodes_to_job,
ClusterManagerAllocateNNodesToJobInput(
num_nodes=2, task_name=f"task-{i}"
),
)
)
await asyncio.gather(*allocation_updates)

if delay_seconds:
await asyncio.sleep(delay_seconds)

deletion_updates = []
for i in range(6):
deletion_updates.append(
wf.execute_update(
ClusterManagerWorkflow.delete_job,
ClusterManagerDeleteJobInput(task_name=f"task-{i}"),
)
)
await asyncio.gather(*deletion_updates)

await wf.signal(ClusterManagerWorkflow.shutdown_cluster)
Member:
Arguably shutdown could be an update that returns what the workflow returns instead of making it a two-step process (but this is fine too)

Contributor (Author):

Cool idea, and would show off the power of update. Ran out of time this A.M, though.



async def main(should_test_continue_as_new: bool):
# Connect to Temporal
client = await Client.connect("localhost:7233")

cluster_manager_handle = await client.start_workflow(
ClusterManagerWorkflow.run,
ClusterManagerInput(test_continue_as_new=should_test_continue_as_new),
id=f"ClusterManagerWorkflow-{uuid.uuid4()}",
task_queue="atomic-message-handlers-task-queue",
id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING,
)
delay_seconds = 10 if should_test_continue_as_new else 1
await do_cluster_lifecycle(cluster_manager_handle, delay_seconds=delay_seconds)
result = await cluster_manager_handle.result()
    print(
        f"Cluster shut down successfully. It peaked at {result.max_assigned_nodes} assigned nodes."
        f" It had {result.num_currently_assigned_nodes} nodes assigned at the end."
    )


if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
parser = argparse.ArgumentParser(description="Atomic message handlers")
parser.add_argument(
"--test-continue-as-new",
help="Make the ClusterManagerWorkflow continue as new before shutting down",
action="store_true",
default=False,
)
args = parser.parse_args()
asyncio.run(main(args.test_continue_as_new))
41 changes: 41 additions & 0 deletions updates_and_signals/atomic_message_handlers/worker.py
@@ -0,0 +1,41 @@
import asyncio
import logging

from temporalio.client import Client
from temporalio.worker import Worker

from updates_and_signals.atomic_message_handlers.workflow import (
ClusterManagerWorkflow,
allocate_nodes_to_job,
deallocate_nodes_for_job,
find_bad_nodes,
)

interrupt_event = asyncio.Event()


async def main():
# Connect client
client = await Client.connect("localhost:7233")

async with Worker(
client,
task_queue="atomic-message-handlers-task-queue",
workflows=[ClusterManagerWorkflow],
activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes],
):
# Wait until interrupted
logging.info("ClusterManagerWorkflow worker started, ctrl+c to exit")
await interrupt_event.wait()
logging.info("Shutting down")


if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
loop = asyncio.new_event_loop()
try:
loop.run_until_complete(main())
except KeyboardInterrupt:
interrupt_event.set()
loop.run_until_complete(loop.shutdown_asyncgens())