fix: use node/python HTTP healthchecks instead of wget/curl

wget exits with an error on a 404 response, even though the services are
healthy (they just have no root endpoint). Use node http.get for the NestJS
services (it accepts any non-5xx response) and python urllib for
voice-service (replacing curl).

Also upgraded the api-gateway depends_on conditions from service_started to service_healthy.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-02-20 00:13:47 -08:00
parent e7ae82e51d
commit d0447fb69f
1 changed file with 16 additions and 16 deletions

View File

@@ -43,19 +43,19 @@ services:
- "18001:8001"
depends_on:
auth-service:
condition: service_started
condition: service_healthy
agent-service:
condition: service_started
condition: service_healthy
ops-service:
condition: service_started
condition: service_healthy
inventory-service:
condition: service_started
condition: service_healthy
monitor-service:
condition: service_started
condition: service_healthy
comm-service:
condition: service_started
condition: service_healthy
audit-service:
condition: service_started
condition: service_healthy
healthcheck:
test: ["CMD", "kong", "health"]
interval: 10s
@@ -86,7 +86,7 @@ services:
- JWT_REFRESH_SECRET=${JWT_REFRESH_SECRET:-dev-jwt-refresh-secret}
- AUTH_SERVICE_PORT=3001
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:3001/api/v1/auth || exit 1"]
test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3001/',r=>{process.exit(r.statusCode<500?0:1)}).on('error',()=>process.exit(1))\""]
interval: 30s
timeout: 5s
retries: 3
@@ -120,7 +120,7 @@ services:
- AGENT_ENGINE_TYPE=claude_code_cli
- AGENT_SERVICE_PORT=3002
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:3002/api/v1/agent || exit 1"]
test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3002/',r=>{process.exit(r.statusCode<500?0:1)}).on('error',()=>process.exit(1))\""]
interval: 30s
timeout: 5s
retries: 3
@@ -152,7 +152,7 @@ services:
- REDIS_URL=redis://redis:6379
- OPS_SERVICE_PORT=3003
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:3003/api/v1/ops || exit 1"]
test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3003/',r=>{process.exit(r.statusCode<500?0:1)}).on('error',()=>process.exit(1))\""]
interval: 30s
timeout: 5s
retries: 3
@@ -185,7 +185,7 @@ services:
- VAULT_MASTER_KEY=${VAULT_MASTER_KEY:-dev-vault-key}
- INVENTORY_SERVICE_PORT=3004
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:3004/api/v1/inventory || exit 1"]
test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3004/',r=>{process.exit(r.statusCode<500?0:1)}).on('error',()=>process.exit(1))\""]
interval: 30s
timeout: 5s
retries: 3
@@ -215,7 +215,7 @@ services:
- REDIS_URL=redis://redis:6379
- MONITOR_SERVICE_PORT=3005
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:3005/api/v1/monitor || exit 1"]
test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3005/',r=>{process.exit(r.statusCode<500?0:1)}).on('error',()=>process.exit(1))\""]
interval: 30s
timeout: 5s
retries: 3
@@ -248,7 +248,7 @@ services:
- TWILIO_PHONE_NUMBER=${TWILIO_PHONE_NUMBER}
- COMM_SERVICE_PORT=3006
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:3006/api/v1/comm || exit 1"]
test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3006/',r=>{process.exit(r.statusCode<500?0:1)}).on('error',()=>process.exit(1))\""]
interval: 30s
timeout: 5s
retries: 3
@@ -280,7 +280,7 @@ services:
- REDIS_URL=redis://redis:6379
- AUDIT_SERVICE_PORT=3007
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:3007/api/v1/audit || exit 1"]
test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3007/',r=>{process.exit(r.statusCode<500?0:1)}).on('error',()=>process.exit(1))\""]
interval: 30s
timeout: 5s
retries: 3
@@ -304,7 +304,7 @@ services:
- KOKORO_MODEL=${KOKORO_MODEL:-kokoro-82m}
- DEVICE=${VOICE_DEVICE:-cpu}
healthcheck:
test: ["CMD-SHELL", "curl -sf http://localhost:3008/docs || exit 1"]
test: ["CMD-SHELL", "python3 -c \"import urllib.request; urllib.request.urlopen('http://localhost:3008/docs')\""]
interval: 30s
timeout: 5s
retries: 3
@@ -326,7 +326,7 @@ services:
- NEXT_PUBLIC_API_BASE_URL=/api/proxy
- NEXT_PUBLIC_WS_URL=ws://localhost:18000
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:3000/ || exit 1"]
test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3000/',r=>{process.exit(r.statusCode<500?0:1)}).on('error',()=>process.exit(1))\""]
interval: 30s
timeout: 5s
retries: 3