# Original listing metadata: 82 lines, 2.1 KiB, YAML
---
# RuntimeClass "nvidia": lets pods select the container runtime handler named
# "nvidia" by setting `runtimeClassName: nvidia` in their pod spec.
# NOTE(review): the handler name must match a runtime handler configured on
# the nodes (presumably the NVIDIA container runtime) - confirm per cluster.
kind: RuntimeClass
apiVersion: node.k8s.io/v1
metadata:
  name: nvidia
handler: nvidia
---
# Deployment: a single-replica SGLang server for
# meta-llama/Llama-3.1-8B-Instruct, requesting one NVIDIA GPU and listening on
# container port 30000.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: meta-llama-31-8b-instruct-sglang
spec:
  replicas: 1
  # Recreate terminates the old pod before the new one is created
  # (presumably so two pods never compete for the same GPU - confirm).
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: meta-llama-31-8b-instruct-sglang
  template:
    metadata:
      labels:
        # `app` must match spec.selector.matchLabels above.
        app: meta-llama-31-8b-instruct-sglang
        model: meta-llama-31-8b-instruct
        engine: sglang
    spec:
      restartPolicy: Always
      # Refers to the RuntimeClass named "nvidia".
      runtimeClassName: nvidia
      containers:
        - name: meta-llama-31-8b-instruct-sglang
          # NOTE(review): `latest` is a moving tag; consider pinning a
          # specific release tag or digest for reproducible rollouts.
          image: docker.io/lmsysorg/sglang:latest
          imagePullPolicy: Always  # alternatives: IfNotPresent or Never
          ports:
            - containerPort: 30000
          command:
            - python3
            - -m
            - sglang.launch_server
          # Every arg must be a string, hence the quotes around the
          # address and the port number.
          args:
            - --model-path
            - meta-llama/Llama-3.1-8B-Instruct
            - --host
            - "0.0.0.0"
            - --port
            - "30000"
          env:
            # The Hugging Face token is read from a Secret rather than being
            # written into this manifest. Create it once per namespace:
            #   kubectl create secret generic hf-token \
            #     --from-literal=HF_TOKEN=<your token>
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token
                  key: HF_TOKEN
          resources:
            limits:
              nvidia.com/gpu: 1
          volumeMounts:
            - name: shm
              mountPath: /dev/shm
            # NOTE(review): mounted read-only, so the model files presumably
            # must already be present in the host cache - confirm.
            - name: hf-cache
              mountPath: /root/.cache/huggingface
              readOnly: true
            - name: localtime
              mountPath: /etc/localtime
              readOnly: true
          # Liveness and readiness checks are held off until the startup
          # probe succeeds. This gives the server up to
          # failureThreshold * periodSeconds = 600 s to load the model before
          # the kubelet restarts the container; with only the liveness probe
          # below, a slow model load ended in a restart after about a minute.
          startupProbe:
            httpGet:
              path: /health
              port: 30000
            periodSeconds: 10
            failureThreshold: 60
          # Keeps the pod out of the Service endpoints while /health fails.
          readinessProbe:
            httpGet:
              path: /health
              port: 30000
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /health
              port: 30000
            initialDelaySeconds: 30
            periodSeconds: 10
      volumes:
        # RAM-backed emptyDir mounted at /dev/shm, capped at 10Gi.
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 10Gi
        # Host directory holding the Hugging Face cache; `type: Directory`
        # requires it to exist on the node already.
        - name: hf-cache
          hostPath:
            path: /root/.cache/huggingface
            type: Directory
        # Host /etc/localtime file, mounted read-only in the container.
        - name: localtime
          hostPath:
            path: /etc/localtime
            type: File
---
# Service: exposes the pods labelled `app: meta-llama-31-8b-instruct-sglang`
# on TCP port 30000 through a LoadBalancer.
kind: Service
apiVersion: v1
metadata:
  name: meta-llama-31-8b-instruct-sglang
spec:
  type: LoadBalancer
  selector:
    app: meta-llama-31-8b-instruct-sglang
  ports:
    - protocol: TCP
      port: 30000  # port exposed by the Service
      targetPort: 30000  # port in container