apiVersion: apps/v1
kind: Deployment
metadata:
  name: sgl-worker
  labels:
    app: sgl-worker
spec:
  replicas: 3
  selector:
    matchLabels:
      app: sgl-worker
  template:
    metadata:
      labels:
        app: sgl-worker
    spec:
      containers:
        - name: sglang
          image: lmsysorg/sglang:latest
          command: ["python3", "-m", "sglang.launch_server"]
          args:
            - "--model-path=Qwen/Qwen3-8B"
            - "--host=0.0.0.0"
            - "--port=8000"
            - "--dtype=bfloat16"
            - "--kv-cache-dtype=auto"
            - "--tp=1"
            - "--mem-fraction-static=0.90"  # Equivalent to vLLM's gpu-memory-utilization
            - "--trust-remote-code"
            - "--enable-metrics"
          env:
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token
                  key: token
                  optional: true
          ports:
            - containerPort: 8000
              name: http
          resources:
            limits:
              nvidia.com/gpu: 1
          volumeMounts:
            - name: model-cache
              mountPath: /root/.cache/huggingface
            - name: dshm
              mountPath: /dev/shm
          readinessProbe:
            httpGet:
              path: /health_generate
              port: 8000
            periodSeconds: 40
            timeoutSeconds: 30
          startupProbe:
            httpGet:
              path: /health_generate
              port: 8000
            # Give the container 10 minutes (30 * 20s) to download and load weights
            failureThreshold: 30
            periodSeconds: 20
      volumes:
        - name: model-cache
          emptyDir: {}
        - name: dshm
          emptyDir:
            medium: Memory
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
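The Deployment above references an optional Secret named hf-token and serves traffic on a named http port, so two small companion manifests are typically paired with it. The sketches below are not part of the original change: the Secret value is a placeholder for your own Hugging Face token, and the Service name sgl-worker-svc is hypothetical; only the hf-token name, the token key, the app: sgl-worker label, and port 8000 are taken from the Deployment itself.

apiVersion: v1
kind: Secret
metadata:
  name: hf-token            # referenced by the Deployment's secretKeyRef
type: Opaque
stringData:
  token: <your-hugging-face-token>   # placeholder; supply your own value
---
apiVersion: v1
kind: Service
metadata:
  name: sgl-worker-svc      # hypothetical name, chosen for this sketch
spec:
  selector:
    app: sgl-worker          # matches the Deployment's pod label
  ports:
    - name: http
      port: 8000
      targetPort: http       # resolves to the container's named port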