From 23322028ee5a5590f89557169f093a9edd6b59e8 Mon Sep 17 00:00:00 2001
From: Rahul Gurnani
Date: Mon, 15 Dec 2025 23:18:45 +0000
Subject: [PATCH 1/3] Add SGLang deployment

---
 config/manifests/sglang/gpu-deployment.yaml | 71 +++++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 config/manifests/sglang/gpu-deployment.yaml

diff --git a/config/manifests/sglang/gpu-deployment.yaml b/config/manifests/sglang/gpu-deployment.yaml
new file mode 100644
index 000000000..0996b72ad
--- /dev/null
+++ b/config/manifests/sglang/gpu-deployment.yaml
@@ -0,0 +1,71 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: sglang-llama3-8b-instruct
+  labels:
+    app: sglang-llama3-8b-instruct
+spec:
+  replicas: 3
+  selector:
+    matchLabels:
+      app: sglang-llama3-8b-instruct
+  template:
+    metadata:
+      labels:
+        app: sglang-llama3-8b-instruct
+    spec:
+      containers:
+      - name: sglang
+        image: lmsysorg/sglang:latest
+        command: ["python3", "-m", "sglang.launch_server"]
+        args:
+        - "--model-path=meta-llama/Llama-3.1-8B-Instruct"
+        - "--host=0.0.0.0"
+        - "--port=8000"
+        - "--dtype=bfloat16"
+        - "--kv-cache-dtype=auto"
+        - "--tp=1" # Tensor parallel size; set to the number of GPUs per replica
+        - "--mem-fraction-static=0.90" # Equivalent to vllm's gpu-memory-utilization
+        - "--trust-remote-code"
+        - "--enable-metrics"
+        env:
+        - name: HF_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: hf-token
+              key: token
+              optional: true
+        ports:
+        - containerPort: 8000
+          name: http
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+        volumeMounts:
+        - name: model-cache
+          mountPath: /root/.cache/huggingface
+        - name: dshm
+          mountPath: /dev/shm
+        readinessProbe:
+          httpGet:
+            path: /health_generate
+            port: 8000
+          periodSeconds: 40
+          timeoutSeconds: 30
+        startupProbe:
+          httpGet:
+            path: /health_generate
+            port: 8000
+          # Give the container 10 minutes (30 * 20s) to download and load weights
+          failureThreshold: 30
+          periodSeconds: 20
+      volumes:
+      - name: model-cache
+        emptyDir: {}
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
\ No newline at end of file

From 7f9c878b54dd53b7262bd86b58dcb4459273ac94 Mon Sep 17 00:00:00 2001
From: Rahul Gurnani
Date: Wed, 17 Dec 2025 23:05:05 +0000
Subject: [PATCH 2/3] Update docs

---
 .../gateway/gke/sglang-httproute.yaml  |  1 +
 site-src/_includes/model-server-cpu.md |  2 +-
 site-src/_includes/model-server-gpu.md |  2 +-
 site-src/_includes/model-server-sim.md |  2 +-
 site-src/_includes/model-server.md     | 19 -------------------
 site-src/_includes/sglang-gpu.md       |  7 +++++++
 site-src/guides/index.md               | 14 ++++++++++++--
 7 files changed, 23 insertions(+), 24 deletions(-)
 create mode 100644 config/manifests/gateway/gke/sglang-httproute.yaml
 delete mode 100644 site-src/_includes/model-server.md
 create mode 100644 site-src/_includes/sglang-gpu.md

diff --git a/config/manifests/gateway/gke/sglang-httproute.yaml b/config/manifests/gateway/gke/sglang-httproute.yaml
new file mode 100644
index 000000000..92cac567b
--- /dev/null
+++ b/config/manifests/gateway/gke/sglang-httproute.yaml
@@ -0,0 +1 @@
+# Sample http route for GKE Gateway to route traffic to sglang InferencePool
diff --git a/site-src/_includes/model-server-cpu.md b/site-src/_includes/model-server-cpu.md
index c2cf00b4d..ef95da70e 100644
--- a/site-src/_includes/model-server-cpu.md
+++ b/site-src/_includes/model-server-cpu.md
@@ -1,4 +1,4 @@
-=== "CPU-Based Model Server"
+=== "CPU-Based vLLM deployment"
 
     ???+ warning
 
diff --git a/site-src/_includes/model-server-gpu.md b/site-src/_includes/model-server-gpu.md
index f2d6e6bbf..98f42c785 100644
--- a/site-src/_includes/model-server-gpu.md
+++ b/site-src/_includes/model-server-gpu.md
@@ -1,4 +1,4 @@
-=== "GPU-Based Model Server"
+=== "GPU-Based vLLM deployment"
 
     For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas as needed.
     Create a Hugging Face secret to download the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).
diff --git a/site-src/_includes/model-server-sim.md b/site-src/_includes/model-server-sim.md
index 3c1a7a4b8..5b277b49a 100644
--- a/site-src/_includes/model-server-sim.md
+++ b/site-src/_includes/model-server-sim.md
@@ -1,4 +1,4 @@
-=== "vLLM Simulator Model Server"
+=== "vLLM Simulator deployment"
 
     This option uses the [vLLM simulator](https://github.com/llm-d/llm-d-inference-sim/tree/main) to simulate a backend model server.
     This setup uses the least amount of compute resources, does not require GPU's, and is ideal for test/dev environments.
diff --git a/site-src/_includes/model-server.md b/site-src/_includes/model-server.md
deleted file mode 100644
index 47d8e54dc..000000000
--- a/site-src/_includes/model-server.md
+++ /dev/null
@@ -1,19 +0,0 @@
- Three options are supported for running the model server:
-
- 1. GPU-based model server.
-    Requirements: a Hugging Face access token that grants access to the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).
-
- 1. CPU-based model server (not using GPUs).
-    The sample uses the model [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct).
-
- 1. [vLLM Simulator](https://github.com/llm-d/llm-d-inference-sim/tree/main) model server (not using GPUs).
-    The sample is configured to simulate the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model.
-
- Choose one of these options and follow the steps below. Please do not deploy more than one, as the deployments have the same name and will override each other.
-
-=== "GPU-Based Model Server"
-
-    For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas in `./config/manifests/vllm/gpu-deployment.yaml` as needed.
-    Create a Hugging Face secret to download the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct). Ensure that the token grants access to this model.
-
-    Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway.
diff --git a/site-src/_includes/sglang-gpu.md b/site-src/_includes/sglang-gpu.md
new file mode 100644
index 000000000..0ad9d3f8e
--- /dev/null
+++ b/site-src/_includes/sglang-gpu.md
@@ -0,0 +1,7 @@
+=== "GPU-Based SGLang deployment"
+
+    For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas as needed.
+    Create a Hugging Face secret to download the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).
+    Ensure that the token grants access to this model.
+
+    Deploy a sample SGLang deployment with the proper protocol to work with the LLM Instance Gateway.
diff --git a/site-src/guides/index.md b/site-src/guides/index.md
index 80d65ea26..f823dc20b 100644
--- a/site-src/guides/index.md
+++ b/site-src/guides/index.md
@@ -42,6 +42,12 @@ IGW_LATEST_RELEASE=$(curl -s https://api.github.com/repos/kubernetes-sigs/gatewa
    kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/vllm/sim-deployment.yaml
    ```
 
+--8<-- "site-src/_includes/sglang-gpu.md"
+
+    ```bash
+    kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/sglang/gpu-deployment.yaml
+    ```
+
 ### Install the Inference Extension CRDs
 
 ```bash
@@ -153,11 +159,15 @@ kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extens
    inference-gateway   inference-gateway   True         22s
    ```
 1. Deploy the HTTPRoute:
-
+
+   For vLLM deployment:
    ```bash
    kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/gateway/gke/httproute.yaml
    ```
-
+   For SGLang deployment:
+   ```bash
+   kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/gateway/gke/sglang-httproute.yaml
+   ```
 1. Confirm that the HTTPRoute status conditions include `Accepted=True` and `ResolvedRefs=True`:
 
    ```bash

From 5cc59ae6b32e00e7f0c2f9e62d7a344863f5b13c Mon Sep 17 00:00:00 2001
From: Rahul Gurnani
Date: Tue, 23 Dec 2025 03:07:00 +0000
Subject: [PATCH 3/3] Update docs with sglang deployment

---
 .../gateway/gke/sglang-httproute.yaml | 19 +++++++-
 site-src/_includes/epp-sglang.md      | 43 +++++++++++++++++++
 site-src/guides/index.md              | 31 ++++++++++---
 3 files changed, 86 insertions(+), 7 deletions(-)
 create mode 100644 site-src/_includes/epp-sglang.md

diff --git a/config/manifests/gateway/gke/sglang-httproute.yaml b/config/manifests/gateway/gke/sglang-httproute.yaml
index 92cac567b..e92108069 100644
--- a/config/manifests/gateway/gke/sglang-httproute.yaml
+++ b/config/manifests/gateway/gke/sglang-httproute.yaml
@@ -1 +1,18 @@
-# Sample http route for GKE Gateway to route traffic to sglang InferencePool
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: llm-route
+spec:
+  parentRefs:
+  - group: gateway.networking.k8s.io
+    kind: Gateway
+    name: inference-gateway
+  rules:
+  - backendRefs:
+    - group: inference.networking.k8s.io
+      kind: InferencePool
+      name: sglang-llama3-8b-instruct
+    matches:
+    - path:
+        type: PathPrefix
+        value: /
diff --git a/site-src/_includes/epp-sglang.md b/site-src/_includes/epp-sglang.md
new file mode 100644
index 000000000..ae9d04fca
--- /dev/null
+++ b/site-src/_includes/epp-sglang.md
@@ -0,0 +1,43 @@
+=== "GKE"
+
+    ```bash
+    export GATEWAY_PROVIDER=gke
+    helm install sglang-llama3-8b-instruct \
+    --set inferencePool.modelServers.matchLabels.app=sglang-llama3-8b-instruct \
+    --set provider.name=$GATEWAY_PROVIDER \
+    --version $IGW_CHART_VERSION \
+    oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
+    ```
+
+=== "Istio"
+
+    ```bash
+    export GATEWAY_PROVIDER=istio
+    helm install sglang-llama3-8b-instruct \
+    --set inferencePool.modelServers.matchLabels.app=sglang-llama3-8b-instruct \
+    --set provider.name=$GATEWAY_PROVIDER \
+    --version $IGW_CHART_VERSION \
+    oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
+    ```
+
+=== "Kgateway"
+
+    ```bash
+    export GATEWAY_PROVIDER=none
+    helm install sglang-llama3-8b-instruct \
+    --set inferencePool.modelServers.matchLabels.app=sglang-llama3-8b-instruct \
+    --set provider.name=$GATEWAY_PROVIDER \
+    --version $IGW_CHART_VERSION \
+    oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
+    ```
+
+=== "NGINX Gateway Fabric"
+
+    ```bash
+    export GATEWAY_PROVIDER=none
+    helm install sglang-llama3-8b-instruct \
+    --set inferencePool.modelServers.matchLabels.app=sglang-llama3-8b-instruct \
+    --set provider.name=$GATEWAY_PROVIDER \
+    --version $IGW_CHART_VERSION \
+    oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
+    ```
diff --git a/site-src/guides/index.md b/site-src/guides/index.md
index f823dc20b..0fa98a393 100644
--- a/site-src/guides/index.md
+++ b/site-src/guides/index.md
@@ -30,6 +30,12 @@ IGW_LATEST_RELEASE=$(curl -s https://api.github.com/repos/kubernetes-sigs/gatewa
    kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/vllm/gpu-deployment.yaml
    ```
 
+--8<-- "site-src/_includes/sglang-gpu.md"
+
+    ```bash
+    kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/sglang/gpu-deployment.yaml
+    ```
+
 --8<-- "site-src/_includes/model-server-cpu.md"
 
    ```bash
@@ -42,12 +48,6 @@ IGW_LATEST_RELEASE=$(curl -s https://api.github.com/repos/kubernetes-sigs/gatewa
    kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/vllm/sim-deployment.yaml
    ```
 
---8<-- "site-src/_includes/sglang-gpu.md"
-
-    ```bash
-    kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/sglang/gpu-deployment.yaml
-    ```
-
 ### Install the Inference Extension CRDs
 
 ```bash
@@ -135,6 +135,11 @@ kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extens
 
 --8<-- "site-src/_includes/epp.md"
 
+For SGLang deployment:
+
+--8<-- "site-src/_includes/epp-sglang.md"
+
+
 ### Deploy an Inference Gateway
 
 Choose one of the following options to deploy an Inference Gateway.
@@ -280,6 +285,12 @@ kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extens
    kubectl describe inferencepools.inference.networking.k8s.io vllm-llama3-8b-instruct
    ```
 
+   For SGLang deployment:
+
+   ```bash
+   kubectl describe inferencepools.inference.networking.k8s.io sglang-llama3-8b-instruct
+   ```
+
    Check that the status shows Accepted=True and ResolvedRefs=True. This confirms the InferencePool is ready to handle traffic.
 
    For more information, see the [NGINX Gateway Fabric - Inference Gateway Setup guide](https://docs.nginx.com/nginx-gateway-fabric/how-to/gateway-api-inference-extension/#overview)
@@ -319,6 +330,14 @@ You have now deployed a basic Inference Gateway with a simple routing strategy.
    kubectl delete secret hf-token --ignore-not-found
    ```
 
+   For SGLang deployment:
+
+   ```bash
+   helm uninstall sglang-llama3-8b-instruct
+   kubectl delete -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/sglang/gpu-deployment.yaml --ignore-not-found
+   kubectl delete secret hf-token --ignore-not-found
+   ```
+
 1. Uninstall the Gateway API Inference Extension CRDs:
 
    ```bash