triton-inference-server · ryanolson · Feb 19, 2025 · Feb 19, 2025 · Feb 19, 2025 · Feb 19, 2025
@@ -0,0 +1,46 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[package]
+name = "planner"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+triton-distributed = { path = "../../../runtime/rust" }
+triton-llm = { path = "../../../llm/rust/triton-llm" }
+
+# workspace
+serde = { version = "1", features = ["derive"] }
+serde_json = { version = "1" }
+tokio = { version = "1", features = ["full"] }
+tracing = { version = "0.1" }
+
+# tmp
+# anyhow = { version = "1" }
+# async-stream = { version = "0.3" }
+# async-trait = { version = "0.1" }
+# bytes = "1"
+# derive_builder = "0.20"
+# futures = "0.3"
+# serde = { version = "1", features = ["derive"] }
+# serde_json = { version = "1" }
+# thiserror = { version = "2.0.11" }
+# tokio = { version = "1", features = ["full"] }
+# tokio-stream = { version = "0.1" }
+# tokio-util = { version = "0.7", features = ["codec", "net"] }
+# tracing = { version = "0.1" }
+# validator = { version = "0.20.0", features = ["derive"] }
+# uuid = { version = "1", features = ["v4", "serde"] }
@@ -0,0 +1,113 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Overwatch is a top-level service that monitors the state of a single Nova Init
+//! Deployment.
+//!
+//! Primary responsibilities:
+//! - Monitor that each component of the pipeline is marked ready
+//! - For components which are expected to expose an [Endpoint], ensure that at least one
+//!   instance is ready to receive traffic
+//!
+//! - Pipeline state:
+//!   - Setup
+//!   - Ready
+//!   - Unavailable
+//!   - TearDown
+//!
+//! - Actions:
+//!   - Setup:
+//!     - Start all components
+//!     - Observe state from backend/terminus to frontend
+//!       - Order provided by the init-graph
+//!   - Healthy:
+//!     - Customization point
+//!       - the primary action when an llm pipeline becomes healthy is to register the pipeline
+//!         as a model name with the http ingress
+//!       - specializations of this service can perform other actions on transition to healthy
+//!   - Unhealthy:
+//!     - Customization point
+//!       - the primary action when an llm pipeline is unhealthy is to mark the model in the http
+//!         ingress as unhealthy so it can return a 503 Service Unavailable error
+//!       - specializations of this service can perform other actions on transition to unhealthy
+//!   - TearDown:
+//!     - Stop all components
+//!     - For a Nova Init deployment, the following actions are taken:
+//!       - The model is permanently removed from the http ingress ensuring no new requests are
+//!         forwarded to the pipeline.
+//!       - Each pipeline component is asked to gracefully terminate. This will allow each outstanding
+//!         task/stream to try to complete.
+//!       - Components like routers can be terminated after all frontends are marked to the TearDown
+//!         state, which ensures that no new requests will be accepted by any of the frontends.
+//!       - Any persistent state in ETCD, NATS, MinIO, etc. should be removed.
+//!       - If this process were to die/fail, persistent state might not be removed, which is a problem
+//!         that the global Oscar service will detect and correct.
+//!
+//! Overwatch should be able to write to a special path in ETCD to track each instance regardless of
+//! which namespace it belongs to. This reserved path will allow tools to quickly parse the set of active
+//! pipelines and read their top-level configs and state.
+//!
+
+// TODO - remove after implementation
+#![allow(dead_code)]
+#![allow(unused_imports)]
+#![allow(unused_variables)]
+//
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use tracing as log;
+use trd::{error, logging, DistributedRuntime, ErrorContext, Result, Runtime, Worker};
+use triton_distributed::{self as trd, actions::Action, engine::async_trait};
+use triton_llm::http::service::actions::HttpAction;
+
+/// Entry point: initialize logging, then run `app` on a worker built from settings.
+fn main() -> Result<()> {
+    logging::init();
+    Worker::from_settings()?.execute(app)
+}
+
+// TODO - refactor much of this back into the library
+/// Builds the distributed runtime from settings and logs its primary lease id.
+async fn app(runtime: Runtime) -> Result<()> {
+    let distributed = DistributedRuntime::from_settings(runtime.clone()).await?;
+    let overwatch_id = distributed.primary_lease().id();
+    log::debug!("Overwatch ID: {}", overwatch_id);
+
+    Ok(())
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct BasicInitGraph {
+    namespace: String,
+
+    /// A map of component name to a list of endpoints
+    /// A [Component] has 0 or more endpoints
+    /// Each [Component] will have a [ServiceState] which will be monitored.
+    /// If a [Component] has [Endpoints][Endpoint], then the list of workers for each [Endpoint]
+    /// will also be monitored.
+    components: HashMap<String, Vec<String>>,
+}
+
+/// Action triggered on Setup
+/// This action will process the InitGraph and perform the coordinated bringup of all components
+/// and endpoints in reverse dependency order, i.e. from the backend to the frontend.
+pub struct InitGraphSetupAction {}
+
+/// Action triggered on Cleanup
+/// This action will process the InitGraph and perform the coordinated tear down of all components
+/// and endpoints.
+///
+/// This action will immediately remove the model from the http ingress.
+pub struct InitGraphCleanupAction {}
@@ -17,10 +17,12 @@ use clap::{Parser, Subcommand};
 use tracing as log;
 
 use triton_distributed::{
-    distributed::DistributedConfig, logging, protocols::Endpoint, raise, DistributedRuntime,
-    Result, Runtime, Worker,
+    distributed::DistributedConfig,
+    logging,
+    protocols::{Endpoint, EndpointAddress},
+    raise, DistributedRuntime, Result, Runtime, Worker,
 };
-use triton_llm::http::service::discovery::ModelEntry;
+use triton_llm::http::service::discovery::{ModelEntry, ModelState};
 
 #[derive(Parser)]
 #[command(author, version, about, long_about = None)]
@@ -54,7 +56,7 @@ enum HttpCommands {
         model_name: String,
 
         /// Endpoint name (format: component.endpoint or namespace.component.endpoint)
-        endpoint_name: String,
+        endpoint_name: EndpointAddress,
     },
 
     /// List chat models
@@ -111,30 +113,10 @@ async fn handle_command(runtime: Runtime, namespace: String, command: Commands)
                         endpoint_name
                     );
 
-                    // parse endpoint
-                    // split by '.' must have 2, can have 3 parts, any more or less is an error
-                    let parts: Vec<&str> = endpoint_name.split('.').collect();
-                    if parts.len() < 2 || parts.len() > 3 {
-                        raise!("Invalid endpoint name: {}", endpoint_name);
-                    }
-
-                    // if 3 parts, then it's namespace.component.endpoint
-                    // if 2 parts, then it's model_name.component.endpoint
-
-                    // create model entry
-                    let endpoint = Endpoint {
-                        namespace: if parts.len() == 3 {
-                            parts[0].to_string()
-                        } else {
-                            namespace.clone()
-                        },
-                        component: parts[parts.len() - 2].to_string(),
-                        name: parts[parts.len() - 1].to_string(),
-                    };
-
                     let model = ModelEntry {
                         name: model_name.clone(),
-                        endpoint,
+                        endpoint: endpoint_name.into(),
+                        state: Some(ModelState::Ready),
                     };
 
                     // add model to etcd
@@ -179,11 +161,12 @@ async fn handle_command(runtime: Runtime, namespace: String, command: Commands)
                             serde_json::from_slice::<ModelEntry>(kv.value()),
                         ) {
                             (Ok(key), Ok(model)) => {
+                                let (n, c, e) = model.endpoint.dissolve();
                                 models.push(ModelRow {
                                     name: key.trim_start_matches(&prefix).to_string(),
-                                    namespace: model.endpoint.namespace,
-                                    component: model.endpoint.component,
-                                    endpoint: model.endpoint.name,
+                                    namespace: n,
+                                    component: c,
+                                    endpoint: e,
                                 });
                             }
                             (Err(e), _) => {

@@ -39,6 +39,7 @@ async-stream = { version = "0.3" }
 async-trait = { version = "0.1" }
 bytes = "1"
 derive_builder = "0.20"
+derive-getters = "0.5"
 futures = "0.3"
 serde = { version = "1", features = ["derive"] }
 thiserror = { version = "2.0.11" }

@@ -48,6 +48,9 @@ validator = { workspace = true }
 uuid = { workspace = true }
 xxhash-rust = { workspace = true }
 
+# actions
+clap = { version = "4.5", features = ["derive"] }
+
 # protocols
 chrono = { version = "0.4" }
 serde_json = { version = "1" }
@@ -67,4 +70,4 @@ mistralrs = { git = "https://github.com/EricLBuehler/mistral.rs.git", rev = "5e6
 insta = { version = "1.41", features = ["glob", "json", "redactions"]}
 proptest = "1.5.0"
 reqwest = { version = "0.12", default-features = false, features = ["json", "stream", "rustls-tls"] }
-rstest = "0.18.2"
+rstest = "0.18.2"
@@ -32,6 +32,7 @@
 
 mod openai;
 
+pub mod actions;
 pub mod discovery;
 pub mod error;
 pub mod metrics;