apiVersion: apps/v1
kind: Deployment
metadata:
  name: sgl-worker
  labels:
    app: sgl-worker
spec:
  replicas: 3
  selector:
    matchLabels:
      app: sgl-worker
  template:
    metadata:
      labels:
        app: sgl-worker
    spec:
      containers:
        - name: sglang
          image: lmsysorg/sglang:latest
          command: ["python3", "-m", "sglang.launch_server"]
          args:
            - "--model-path=Qwen/Qwen3-8B"
            - "--host=0.0.0.0"
            - "--port=8000"
            - "--dtype=bfloat16"
            - "--kv-cache-dtype=auto"
            - "--tp=1"
            - "--mem-fraction-static=0.90"  # Equivalent to vLLM's gpu-memory-utilization
            - "--trust-remote-code"
            - "--enable-metrics"
          env:
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token
                  key: token
                  optional: true
          ports:
            - containerPort: 8000
              name: http
          resources:
            limits:
              nvidia.com/gpu: 1
          volumeMounts:
            - name: model-cache
              mountPath: /root/.cache/huggingface
            - name: dshm
              mountPath: /dev/shm
          readinessProbe:
            httpGet:
              path: /health_generate
              port: 8000
            periodSeconds: 40
            timeoutSeconds: 30
          startupProbe:
            httpGet:
              path: /health_generate
              port: 8000
            # Give the container 10 minutes (30 * 20s) to download and load weights
            failureThreshold: 30
            periodSeconds: 20
      volumes:
        - name: model-cache
          emptyDir: {}
        - name: dshm
          emptyDir:
            medium: Memory
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
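The Deployment above references an optional Secret named hf-token and serves traffic on a named http port, so two small companion manifests are typically paired with it. The sketches below are not part of the original change: the Secret value is a placeholder for your own Hugging Face token, and the Service name sgl-worker-svc is hypothetical; only the hf-token name, the token key, the app: sgl-worker label, and port 8000 are taken from the Deployment itself.

apiVersion: v1
kind: Secret
metadata:
  name: hf-token            # referenced by the Deployment's secretKeyRef
type: Opaque
stringData:
  token: <your-hugging-face-token>   # placeholder; supply your own value
---
apiVersion: v1
kind: Service
metadata:
  name: sgl-worker-svc      # hypothetical name, chosen for this sketch
spec:
  selector:
    app: sgl-worker          # matches the Deployment's pod label
  ports:
    - name: http
      port: 8000
      targetPort: http       # resolves to the container's named port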