use crate::routers::RouterTrait; use futures::{StreamExt, TryStreamExt}; use k8s_openapi::api::core::v1::Pod; use kube::{ api::Api, runtime::watcher::{watcher, Config}, runtime::WatchStreamExt, Client, }; use std::collections::{HashMap, HashSet}; use rustls; use std::sync::{Arc, Mutex}; use std::time::Duration; use tokio::task; use tokio::time; use tracing::{debug, error, info, warn}; /// Represents the service discovery configuration #[derive(Debug, Clone)] pub struct ServiceDiscoveryConfig { pub enabled: bool, pub selector: HashMap, pub check_interval: Duration, pub port: u16, pub namespace: Option, // PD mode specific configuration pub pd_mode: bool, pub prefill_selector: HashMap, pub decode_selector: HashMap, // Bootstrap port annotation specific to mooncake implementation pub bootstrap_port_annotation: String, } impl Default for ServiceDiscoveryConfig { fn default() -> Self { ServiceDiscoveryConfig { enabled: false, selector: HashMap::new(), check_interval: Duration::from_secs(60), port: 8000, // Standard port for modern services namespace: None, // None means watch all namespaces pd_mode: false, prefill_selector: HashMap::new(), decode_selector: HashMap::new(), bootstrap_port_annotation: "sglang.ai/bootstrap-port".to_string(), } } } /// Pod type for PD mode service discovery #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum PodType { Prefill, Decode, Regular, } /// Represents a Kubernetes pod's information used for worker management #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct PodInfo { pub name: String, pub ip: String, pub status: String, pub is_ready: bool, pub pod_type: Option, pub bootstrap_port: Option, } impl PodInfo { /// Check if a pod matches any of the given selectors fn matches_selector(pod: &Pod, selector: &HashMap) -> bool { if selector.is_empty() { return false; } pod.metadata .labels .as_ref() .is_some_and(|labels| selector.iter().all(|(k, v)| labels.get(k) == Some(v))) } /// Check if a pod should be included in service discovery pub fn should_include(pod: &Pod, config: &ServiceDiscoveryConfig) -> bool { if config.pd_mode { // In PD mode, at least one selector must be non-empty if config.prefill_selector.is_empty() && config.decode_selector.is_empty() { warn!("PD mode enabled but both prefill_selector and decode_selector are empty"); return false; } // In PD mode, pod must match either prefill or decode selector Self::matches_selector(pod, &config.prefill_selector) || Self::matches_selector(pod, &config.decode_selector) } else { // In regular mode, pod must match the general selector if config.selector.is_empty() { warn!("Regular mode enabled but selector is empty"); return false; } Self::matches_selector(pod, &config.selector) } } /// Unified PodInfo creation with optional PD configuration pub fn from_pod(pod: &Pod, config: Option<&ServiceDiscoveryConfig>) -> Option { let name = pod.metadata.name.clone()?; let status = pod.status.clone()?; let pod_ip = status.pod_ip?; let is_ready = if let Some(conditions) = &status.conditions { conditions .iter() .any(|condition| condition.type_ == "Ready" && condition.status == "True") } else { false }; let pod_status = status.phase.unwrap_or_else(|| "Unknown".to_string()); // Determine pod type based on labels if config is provided and in PD mode let pod_type = if let Some(config) = config { if config.pd_mode { // Use simplified helper methods for cleaner logic if Self::matches_selector(pod, &config.prefill_selector) { Some(PodType::Prefill) } else if Self::matches_selector(pod, &config.decode_selector) { Some(PodType::Decode) } else { Some(PodType::Regular) } } else { Some(PodType::Regular) } } else { // No config provided, default to None (for backwards compatibility) None }; // Extract bootstrap port from annotations for prefill pods let bootstrap_port = if matches!(pod_type, Some(PodType::Prefill)) { if let Some(config) = config { pod.metadata .annotations .as_ref() .and_then(|annotations| annotations.get(&config.bootstrap_port_annotation)) .and_then(|port_str| port_str.parse::().ok()) } else { None } } else { None }; Some(PodInfo { name, ip: pod_ip, status: pod_status, is_ready, pod_type, bootstrap_port, }) } /// Returns true if the pod is in a state where it can accept traffic pub fn is_healthy(&self) -> bool { self.is_ready && self.status == "Running" } /// Generates a worker URL for this pod pub fn worker_url(&self, port: u16) -> String { format!("http://{}:{}", self.ip, port) } } pub async fn start_service_discovery( config: ServiceDiscoveryConfig, router: Arc, ) -> Result, kube::Error> { // Don't initialize anything if service discovery is disabled if !config.enabled { // Return a generic error when service discovery is disabled return Err(kube::Error::Api(kube::error::ErrorResponse { status: "Disabled".to_string(), message: "Service discovery is disabled".to_string(), reason: "ConfigurationError".to_string(), code: 400, })); } let _ = rustls::crypto::ring::default_provider().install_default(); // Initialize Kubernetes client let client = Client::try_default().await?; // Log the appropriate selectors based on mode if config.pd_mode { let prefill_selector = config .prefill_selector .iter() .map(|(k, v)| format!("{}={}", k, v)) .collect::>() .join(","); let decode_selector = config .decode_selector .iter() .map(|(k, v)| format!("{}={}", k, v)) .collect::>() .join(","); info!( "Starting K8s service discovery | PD mode | prefill: '{}' | decode: '{}'", prefill_selector, decode_selector ); } else { let label_selector = config .selector .iter() .map(|(k, v)| format!("{}={}", k, v)) .collect::>() .join(","); info!( "Starting K8s service discovery | selector: '{}'", label_selector ); } // Create the task that will run in the background let handle = task::spawn(async move { // We'll track pods we've already added to avoid duplicates let tracked_pods = Arc::new(Mutex::new(HashSet::new())); // Create a watcher for pods let pods: Api = if let Some(namespace) = &config.namespace { Api::namespaced(client, namespace) } else { Api::all(client) }; debug!("K8s service discovery initialized"); // Create Arcs for configuration data let config_arc = Arc::new(config.clone()); let port = config.port; let mut retry_delay = Duration::from_secs(1); const MAX_RETRY_DELAY: Duration = Duration::from_secs(300); // 5 minutes max loop { // Create a watcher with the proper parameters according to the kube-rs API let watcher_config = Config::default(); let watcher_stream = watcher(pods.clone(), watcher_config).applied_objects(); // Clone Arcs for the closures let config_clone = Arc::clone(&config_arc); let tracked_pods_clone = Arc::clone(&tracked_pods); // Simplified label selector filter using helper method let filtered_stream = watcher_stream.filter_map(move |obj_res| { let config_inner = Arc::clone(&config_clone); async move { match obj_res { Ok(pod) => { if PodInfo::should_include(&pod, &config_inner) { Some(Ok(pod)) } else { None } } Err(e) => Some(Err(e)), } } }); // Clone again for the next closure let tracked_pods_clone2 = Arc::clone(&tracked_pods_clone); let router_clone = Arc::clone(&router); let config_clone2 = Arc::clone(&config_arc); match filtered_stream .try_for_each(move |pod| { let tracked_pods_inner = Arc::clone(&tracked_pods_clone2); let router_inner = Arc::clone(&router_clone); let config_inner = Arc::clone(&config_clone2); async move { let pod_info = PodInfo::from_pod(&pod, Some(&config_inner)); if let Some(pod_info) = pod_info { if pod.metadata.deletion_timestamp.is_some() { handle_pod_deletion( &pod_info, tracked_pods_inner, router_inner, port, config_inner.pd_mode, ) .await; } else { handle_pod_event( &pod_info, tracked_pods_inner, router_inner, port, config_inner.pd_mode, ) .await; } } Ok(()) } }) .await { Ok(_) => { // Reset retry delay on success retry_delay = Duration::from_secs(1); } Err(err) => { error!("Error in Kubernetes watcher: {}", err); warn!( "Retrying in {} seconds with exponential backoff", retry_delay.as_secs() ); time::sleep(retry_delay).await; // Exponential backoff with jitter retry_delay = std::cmp::min(retry_delay * 2, MAX_RETRY_DELAY); } } // If the watcher exits for some reason, wait a bit before restarting warn!( "Kubernetes watcher exited, restarting in {} seconds", config_arc.check_interval.as_secs() ); time::sleep(config_arc.check_interval).await; } }); Ok(handle) } async fn handle_pod_event( pod_info: &PodInfo, tracked_pods: Arc>>, router: Arc, port: u16, pd_mode: bool, ) { let worker_url = pod_info.worker_url(port); // If pod is healthy, try to add it (with atomic check-and-insert) if pod_info.is_healthy() { // Atomic check-and-insert to prevent race conditions let should_add = { let mut tracker = match tracked_pods.lock() { Ok(tracker) => tracker, Err(e) => { error!("Failed to acquire tracked_pods lock: {}", e); return; } }; if tracker.contains(pod_info) { false // Already tracked } else { // Reserve the spot to prevent other threads from adding the same pod tracker.insert(pod_info.clone()); true } }; if should_add { info!( "Adding pod: {} | type: {:?} | url: {}", pod_info.name, pod_info.pod_type, worker_url ); // Handle PD mode with specific pod types let result = if pd_mode && pod_info.pod_type.is_some() { // Need to import PDRouter type use crate::routers::http::pd_router::PDRouter; // Try to downcast to PDRouter if let Some(pd_router) = router.as_any().downcast_ref::() { match &pod_info.pod_type { Some(PodType::Prefill) => pd_router .add_prefill_server(worker_url.clone(), pod_info.bootstrap_port) .await .map_err(|e| e.to_string()), Some(PodType::Decode) => pd_router .add_decode_server(worker_url.clone()) .await .map_err(|e| e.to_string()), Some(PodType::Regular) | None => { // Fall back to regular add_worker for regular pods router.add_worker(&worker_url).await } } } else { Err("PD mode enabled but router is not a PDRouter".to_string()) } } else { // Regular mode or no pod type specified router.add_worker(&worker_url).await }; match result { Ok(_) => { debug!("Worker added: {}", worker_url); } Err(e) => { error!("Failed to add worker {} to router: {}", worker_url, e); // Remove from tracking since addition failed if let Ok(mut tracker) = tracked_pods.lock() { tracker.remove(pod_info); } } } } } } async fn handle_pod_deletion( pod_info: &PodInfo, tracked_pods: Arc>>, router: Arc, port: u16, pd_mode: bool, ) { let worker_url = pod_info.worker_url(port); let was_tracked = { let mut tracked = match tracked_pods.lock() { Ok(tracked) => tracked, Err(e) => { error!("Failed to acquire tracked_pods lock during deletion: {}", e); return; } }; tracked.remove(pod_info) }; if was_tracked { info!( "Removing pod: {} | type: {:?} | url: {}", pod_info.name, pod_info.pod_type, worker_url ); // Handle PD mode removal if pd_mode && pod_info.pod_type.is_some() { use crate::routers::http::pd_router::PDRouter; // Try to downcast to PDRouter for PD-specific removal if let Some(pd_router) = router.as_any().downcast_ref::() { match &pod_info.pod_type { Some(PodType::Prefill) => { if let Err(e) = pd_router.remove_prefill_server(&worker_url).await { error!("Failed to remove prefill server {}: {}", worker_url, e); } } Some(PodType::Decode) => { if let Err(e) = pd_router.remove_decode_server(&worker_url).await { error!("Failed to remove decode server {}: {}", worker_url, e); } } Some(PodType::Regular) | None => { // Fall back to regular remove_worker router.remove_worker(&worker_url); } } } else { // PD mode but not a PDRouter, use generic removal router.remove_worker(&worker_url); } } else { // Regular mode removal router.remove_worker(&worker_url); } } else { // This case might occur if a pod is deleted before it was ever marked healthy and added. // Or if the event is duplicated. No action needed on the router if it wasn't tracked (and thus not added). debug!( "Pod deletion event for untracked/already removed pod: {} (type: {:?}). Worker URL: {}", pod_info.name, pod_info.pod_type, worker_url ); } } #[cfg(test)] mod tests { use super::*; use k8s_openapi::api::core::v1::{Pod, PodCondition, PodSpec, PodStatus}; use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta; use k8s_openapi::apimachinery::pkg::apis::meta::v1::Time; // Helper function to create a Pod for testing PodInfo::from_pod fn create_k8s_pod( name: Option<&str>, ip: Option<&str>, phase: Option<&str>, ready_status: Option<&str>, deletion_timestamp: Option