From d0447fb69ffb7de57d8b4c6b3a7c8b6db07a6e13 Mon Sep 17 00:00:00 2001
From: hailin
Date: Fri, 20 Feb 2026 00:13:47 -0800
Subject: [PATCH] fix: use node/python HTTP healthchecks instead of wget

wget returns an error on 404, but services are healthy (just no root endpoint).
Use node http.get for NestJS services (accepts any non-5xx response); the
same node probe also replaces the wget check on the service on port 3000.
Use python urllib for voice-service (replacing its curl check).
Also upgrade api-gateway depends_on to service_healthy.

Co-Authored-By: Claude Opus 4.6
---
 deploy/docker/docker-compose.yml | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/deploy/docker/docker-compose.yml b/deploy/docker/docker-compose.yml
index 100ada9..5917cd8 100644
--- a/deploy/docker/docker-compose.yml
+++ b/deploy/docker/docker-compose.yml
@@ -43,19 +43,19 @@ services:
       - "18001:8001"
     depends_on:
       auth-service:
-        condition: service_started
+        condition: service_healthy
       agent-service:
-        condition: service_started
+        condition: service_healthy
       ops-service:
-        condition: service_started
+        condition: service_healthy
       inventory-service:
-        condition: service_started
+        condition: service_healthy
       monitor-service:
-        condition: service_started
+        condition: service_healthy
       comm-service:
-        condition: service_started
+        condition: service_healthy
       audit-service:
-        condition: service_started
+        condition: service_healthy
     healthcheck:
       test: ["CMD", "kong", "health"]
       interval: 10s
@@ -86,7 +86,7 @@ services:
       - JWT_REFRESH_SECRET=${JWT_REFRESH_SECRET:-dev-jwt-refresh-secret}
       - AUTH_SERVICE_PORT=3001
     healthcheck:
-      test: ["CMD-SHELL", "wget -qO- http://localhost:3001/api/v1/auth || exit 1"]
+      test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3001/',r=>{process.exit(r.statusCode<500?0:1)}).on('error',()=>process.exit(1))\""]
       interval: 30s
       timeout: 5s
       retries: 3
@@ -120,7 +120,7 @@ services:
       - AGENT_ENGINE_TYPE=claude_code_cli
       - AGENT_SERVICE_PORT=3002
     healthcheck:
-      test: ["CMD-SHELL", "wget -qO- http://localhost:3002/api/v1/agent || exit 1"]
+      test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3002/',r=>{process.exit(r.statusCode<500?0:1)}).on('error',()=>process.exit(1))\""]
       interval: 30s
       timeout: 5s
       retries: 3
@@ -152,7 +152,7 @@ services:
       - REDIS_URL=redis://redis:6379
       - OPS_SERVICE_PORT=3003
     healthcheck:
-      test: ["CMD-SHELL", "wget -qO- http://localhost:3003/api/v1/ops || exit 1"]
+      test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3003/',r=>{process.exit(r.statusCode<500?0:1)}).on('error',()=>process.exit(1))\""]
       interval: 30s
       timeout: 5s
       retries: 3
@@ -185,7 +185,7 @@ services:
       - VAULT_MASTER_KEY=${VAULT_MASTER_KEY:-dev-vault-key}
       - INVENTORY_SERVICE_PORT=3004
     healthcheck:
-      test: ["CMD-SHELL", "wget -qO- http://localhost:3004/api/v1/inventory || exit 1"]
+      test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3004/',r=>{process.exit(r.statusCode<500?0:1)}).on('error',()=>process.exit(1))\""]
       interval: 30s
       timeout: 5s
       retries: 3
@@ -215,7 +215,7 @@ services:
       - REDIS_URL=redis://redis:6379
       - MONITOR_SERVICE_PORT=3005
     healthcheck:
-      test: ["CMD-SHELL", "wget -qO- http://localhost:3005/api/v1/monitor || exit 1"]
+      test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3005/',r=>{process.exit(r.statusCode<500?0:1)}).on('error',()=>process.exit(1))\""]
       interval: 30s
       timeout: 5s
       retries: 3
@@ -248,7 +248,7 @@ services:
       - TWILIO_PHONE_NUMBER=${TWILIO_PHONE_NUMBER}
       - COMM_SERVICE_PORT=3006
     healthcheck:
-      test: ["CMD-SHELL", "wget -qO- http://localhost:3006/api/v1/comm || exit 1"]
+      test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3006/',r=>{process.exit(r.statusCode<500?0:1)}).on('error',()=>process.exit(1))\""]
       interval: 30s
       timeout: 5s
       retries: 3
@@ -280,7 +280,7 @@ services:
       - REDIS_URL=redis://redis:6379
       - AUDIT_SERVICE_PORT=3007
     healthcheck:
-      test: ["CMD-SHELL", "wget -qO- http://localhost:3007/api/v1/audit || exit 1"]
+      test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3007/',r=>{process.exit(r.statusCode<500?0:1)}).on('error',()=>process.exit(1))\""]
       interval: 30s
       timeout: 5s
       retries: 3
@@ -304,7 +304,7 @@ services:
       - KOKORO_MODEL=${KOKORO_MODEL:-kokoro-82m}
       - DEVICE=${VOICE_DEVICE:-cpu}
     healthcheck:
-      test: ["CMD-SHELL", "curl -sf http://localhost:3008/docs || exit 1"]
+      test: ["CMD-SHELL", "python3 -c \"import urllib.request; urllib.request.urlopen('http://localhost:3008/docs')\""]
       interval: 30s
       timeout: 5s
       retries: 3
@@ -326,7 +326,7 @@ services:
       - NEXT_PUBLIC_API_BASE_URL=/api/proxy
       - NEXT_PUBLIC_WS_URL=ws://localhost:18000
     healthcheck:
-      test: ["CMD-SHELL", "wget -qO- http://localhost:3000/ || exit 1"]
+      test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3000/',r=>{process.exit(r.statusCode<500?0:1)}).on('error',()=>process.exit(1))\""]
       interval: 30s
       timeout: 5s
       retries: 3