From 23322028ee5a5590f89557169f093a9edd6b59e8 Mon Sep 17 00:00:00 2001
From: Rahul Gurnani
Date: Mon, 15 Dec 2025 23:18:45 +0000
Subject: [PATCH 1/3] Add SGLang deployment

---
 config/manifests/sglang/gpu-deployment.yaml | 71 +++++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 config/manifests/sglang/gpu-deployment.yaml

diff --git a/config/manifests/sglang/gpu-deployment.yaml b/config/manifests/sglang/gpu-deployment.yaml
new file mode 100644
index 000000000..0996b72ad
--- /dev/null
+++ b/config/manifests/sglang/gpu-deployment.yaml
@@ -0,0 +1,71 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: sglang-llama3-8b-instruct
+  labels:
+    app: sglang-llama3-8b-instruct
+spec:
+  replicas: 3
+  selector:
+    matchLabels:
+      app: sglang-llama3-8b-instruct
+  template:
+    metadata:
+      labels:
+        app: sglang-llama3-8b-instruct
+    spec:
+      containers:
+      - name: sglang
+        image: lmsysorg/sglang:latest
+        command: ["python3", "-m", "sglang.launch_server"]
+        args:
+        - "--model-path=meta-llama/Llama-3.1-8B-Instruct"
+        - "--host=0.0.0.0"
+        - "--port=8000"
+        - "--dtype=bfloat16"
+        - "--kv-cache-dtype=auto"
+        - "--tp=1" # Tensor parallel size; set to the number of GPUs per replica
+        - "--mem-fraction-static=0.90" # Equivalent to vllm's gpu-memory-utilization
+        - "--trust-remote-code"
+        - "--enable-metrics"
+        env:
+        - name: HF_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: hf-token
+              key: token
+              optional: true
+        ports:
+        - containerPort: 8000
+          name: http
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+        volumeMounts:
+        - name: model-cache
+          mountPath: /root/.cache/huggingface
+        - name: dshm
+          mountPath: /dev/shm
+        readinessProbe:
+          httpGet:
+            path: /health_generate
+            port: 8000
+          periodSeconds: 40
+          timeoutSeconds: 30
+        startupProbe:
+          httpGet:
+            path: /health_generate
+            port: 8000
+          # Give the container 10 minutes (30 * 20s) to download and load weights
+          failureThreshold: 30
+          periodSeconds: 20
+      volumes:
+      - name: model-cache
+        emptyDir: {}
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
\ No newline at end of file

From 7f9c878b54dd53b7262bd86b58dcb4459273ac94 Mon Sep 17 00:00:00 2001
From: Rahul Gurnani
Date: Wed, 17 Dec 2025 23:05:05 +0000
Subject: [PATCH 2/3] Update docs

---
 .../gateway/gke/sglang-httproute.yaml  |  1 +
 site-src/_includes/model-server-cpu.md |  2 +-
 site-src/_includes/model-server-gpu.md |  2 +-
 site-src/_includes/model-server-sim.md |  2 +-
 site-src/_includes/model-server.md     | 19 -------------------
 site-src/_includes/sglang-gpu.md       |  7 +++++++
 site-src/guides/index.md               | 14 ++++++++++++--
 7 files changed, 23 insertions(+), 24 deletions(-)
 create mode 100644 config/manifests/gateway/gke/sglang-httproute.yaml
 delete mode 100644 site-src/_includes/model-server.md
 create mode 100644 site-src/_includes/sglang-gpu.md

diff --git a/config/manifests/gateway/gke/sglang-httproute.yaml b/config/manifests/gateway/gke/sglang-httproute.yaml
new file mode 100644
index 000000000..92cac567b
--- /dev/null
+++ b/config/manifests/gateway/gke/sglang-httproute.yaml
@@ -0,0 +1 @@
+# Sample http route for GKE Gateway to route traffic to sglang InferencePool
diff --git a/site-src/_includes/model-server-cpu.md b/site-src/_includes/model-server-cpu.md
index c2cf00b4d..ef95da70e 100644
--- a/site-src/_includes/model-server-cpu.md
+++ b/site-src/_includes/model-server-cpu.md
@@ -1,4 +1,4 @@
-=== "CPU-Based Model Server"
+=== "CPU-Based vLLM deployment"
 
     ???+ warning
 
diff --git a/site-src/_includes/model-server-gpu.md b/site-src/_includes/model-server-gpu.md
index f2d6e6bbf..98f42c785 100644
--- a/site-src/_includes/model-server-gpu.md
+++ b/site-src/_includes/model-server-gpu.md
@@ -1,4 +1,4 @@
-=== "GPU-Based Model Server"
+=== "GPU-Based vLLM deployment"
 
     For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas as needed.
     Create a Hugging Face secret to download the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).
diff --git a/site-src/_includes/model-server-sim.md b/site-src/_includes/model-server-sim.md
index 3c1a7a4b8..5b277b49a 100644
--- a/site-src/_includes/model-server-sim.md
+++ b/site-src/_includes/model-server-sim.md
@@ -1,4 +1,4 @@
-=== "vLLM Simulator Model Server"
+=== "vLLM Simulator deployment"
 
     This option uses the [vLLM simulator](https://github.com/llm-d/llm-d-inference-sim/tree/main) to simulate a backend model server.
     This setup uses the least amount of compute resources, does not require GPU's, and is ideal for test/dev environments.
diff --git a/site-src/_includes/model-server.md b/site-src/_includes/model-server.md
deleted file mode 100644
index 47d8e54dc..000000000
--- a/site-src/_includes/model-server.md
+++ /dev/null
@@ -1,19 +0,0 @@
- Three options are supported for running the model server:
-
- 1. GPU-based model server.
-    Requirements: a Hugging Face access token that grants access to the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).
-
- 1. CPU-based model server (not using GPUs).
-    The sample uses the model [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct).
-
- 1. [vLLM Simulator](https://github.com/llm-d/llm-d-inference-sim/tree/main) model server (not using GPUs).
-    The sample is configured to simulate the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model.
-
- Choose one of these options and follow the steps below. Please do not deploy more than one, as the deployments have the same name and will override each other.
-
-=== "GPU-Based Model Server"
-
-    For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas in `./config/manifests/vllm/gpu-deployment.yaml` as needed.
-    Create a Hugging Face secret to download the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct). Ensure that the token grants access to this model.
-
-    Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway.
diff --git a/site-src/_includes/sglang-gpu.md b/site-src/_includes/sglang-gpu.md
new file mode 100644
index 000000000..0ad9d3f8e
--- /dev/null
+++ b/site-src/_includes/sglang-gpu.md
@@ -0,0 +1,7 @@
+=== "GPU-Based SGLang deployment"
+
+    For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas as needed.
+    Create a Hugging Face secret to download the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).
+    Ensure that the token grants access to this model.
+
+    Deploy a sample SGLang deployment with the proper protocol to work with the LLM Instance Gateway.
diff --git a/site-src/guides/index.md b/site-src/guides/index.md
index 80d65ea26..f823dc20b 100644
--- a/site-src/guides/index.md
+++ b/site-src/guides/index.md
@@ -42,6 +42,12 @@ IGW_LATEST_RELEASE=$(curl -s https://api.github.com/repos/kubernetes-sigs/gatewa
    kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/vllm/sim-deployment.yaml
    ```
 
+--8<-- "site-src/_includes/sglang-gpu.md"
+
+    ```bash
+    kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/sglang/gpu-deployment.yaml
+    ```
+
 ### Install the Inference Extension CRDs
 
 ```bash
@@ -153,11 +159,15 @@ kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extens
    inference-gateway   inference-gateway   True         22s
    ```
 1. Deploy the HTTPRoute:
-
+
+   For vLLM deployment:
    ```bash
    kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/gateway/gke/httproute.yaml
    ```
-
+   For SGLang deployment:
+   ```bash
+   kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/gateway/gke/sglang-httproute.yaml
+   ```
 1. Confirm that the HTTPRoute status conditions include `Accepted=True` and `ResolvedRefs=True`:
 
    ```bash

From 5cc59ae6b32e00e7f0c2f9e62d7a344863f5b13c Mon Sep 17 00:00:00 2001
From: Rahul Gurnani
Date: Tue, 23 Dec 2025 03:07:00 +0000
Subject: [PATCH 3/3] Update docs with sglang deployment

---
 .../gateway/gke/sglang-httproute.yaml | 19 +++++++-
 site-src/_includes/epp-sglang.md      | 43 +++++++++++++++++++
 site-src/guides/index.md              | 31 ++++++++++---
 3 files changed, 86 insertions(+), 7 deletions(-)
 create mode 100644 site-src/_includes/epp-sglang.md

diff --git a/config/manifests/gateway/gke/sglang-httproute.yaml b/config/manifests/gateway/gke/sglang-httproute.yaml
index 92cac567b..e92108069 100644
--- a/config/manifests/gateway/gke/sglang-httproute.yaml
+++ b/config/manifests/gateway/gke/sglang-httproute.yaml
@@ -1 +1,18 @@
-# Sample http route for GKE Gateway to route traffic to sglang InferencePool
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: llm-route
+spec:
+  parentRefs:
+  - group: gateway.networking.k8s.io
+    kind: Gateway
+    name: inference-gateway
+  rules:
+  - backendRefs:
+    - group: inference.networking.k8s.io
+      kind: InferencePool
+      name: sglang-llama3-8b-instruct
+    matches:
+    - path:
+        type: PathPrefix
+        value: /
diff --git a/site-src/_includes/epp-sglang.md b/site-src/_includes/epp-sglang.md
new file mode 100644
index 000000000..ae9d04fca
--- /dev/null
+++ b/site-src/_includes/epp-sglang.md
@@ -0,0 +1,43 @@
+=== "GKE"
+
+    ```bash
+    export GATEWAY_PROVIDER=gke
+    helm install sglang-llama3-8b-instruct \
+    --set inferencePool.modelServers.matchLabels.app=sglang-llama3-8b-instruct \
+    --set provider.name=$GATEWAY_PROVIDER \
+    --version $IGW_CHART_VERSION \
+    oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
+    ```
+
+=== "Istio"
+
+    ```bash
+    export GATEWAY_PROVIDER=istio
+    helm install sglang-llama3-8b-instruct \
+    --set inferencePool.modelServers.matchLabels.app=sglang-llama3-8b-instruct \
+    --set provider.name=$GATEWAY_PROVIDER \
+    --version $IGW_CHART_VERSION \
+    oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
+    ```
+
+=== "Kgateway"
+
+    ```bash
+    export GATEWAY_PROVIDER=none
+    helm install sglang-llama3-8b-instruct \
+    --set inferencePool.modelServers.matchLabels.app=sglang-llama3-8b-instruct \
+    --set provider.name=$GATEWAY_PROVIDER \
+    --version $IGW_CHART_VERSION \
+    oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
+    ```
+
+=== "NGINX Gateway Fabric"
+
+    ```bash
+    export GATEWAY_PROVIDER=none
+    helm install sglang-llama3-8b-instruct \
+    --set inferencePool.modelServers.matchLabels.app=sglang-llama3-8b-instruct \
+    --set provider.name=$GATEWAY_PROVIDER \
+    --version $IGW_CHART_VERSION \
+    oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
+    ```
diff --git a/site-src/guides/index.md b/site-src/guides/index.md
index f823dc20b..0fa98a393 100644
--- a/site-src/guides/index.md
+++ b/site-src/guides/index.md
@@ -30,6 +30,12 @@ IGW_LATEST_RELEASE=$(curl -s https://api.github.com/repos/kubernetes-sigs/gatewa
    kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/vllm/gpu-deployment.yaml
    ```
 
+--8<-- "site-src/_includes/sglang-gpu.md"
+
+    ```bash
+    kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/sglang/gpu-deployment.yaml
+    ```
+
 --8<-- "site-src/_includes/model-server-cpu.md"
 
    ```bash
@@ -42,12 +48,6 @@ IGW_LATEST_RELEASE=$(curl -s https://api.github.com/repos/kubernetes-sigs/gatewa
    kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/vllm/sim-deployment.yaml
    ```
 
---8<-- "site-src/_includes/sglang-gpu.md"
-
-    ```bash
-    kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/sglang/gpu-deployment.yaml
-    ```
-
 ### Install the Inference Extension CRDs
 
 ```bash
@@ -135,6 +135,11 @@ kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extens
 
 --8<-- "site-src/_includes/epp.md"
 
+For SGLang deployment:
+
+--8<-- "site-src/_includes/epp-sglang.md"
+
+
 ### Deploy an Inference Gateway
 
 Choose one of the following options to deploy an Inference Gateway.
@@ -280,6 +285,12 @@ kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extens
    kubectl describe inferencepools.inference.networking.k8s.io vllm-llama3-8b-instruct
    ```
 
+   For SGLang deployment:
+
+   ```bash
+   kubectl describe inferencepools.inference.networking.k8s.io sglang-llama3-8b-instruct
+   ```
+
    Check that the status shows Accepted=True and ResolvedRefs=True. This confirms the InferencePool is ready to handle traffic.
 
    For more information, see the [NGINX Gateway Fabric - Inference Gateway Setup guide](https://docs.nginx.com/nginx-gateway-fabric/how-to/gateway-api-inference-extension/#overview)
@@ -319,6 +330,14 @@ You have now deployed a basic Inference Gateway with a simple routing strategy.
    kubectl delete secret hf-token --ignore-not-found
    ```
 
+   For SGLang deployment:
+
+   ```bash
+   helm uninstall sglang-llama3-8b-instruct
+   kubectl delete -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/sglang/gpu-deployment.yaml --ignore-not-found
+   kubectl delete secret hf-token --ignore-not-found
+   ```
+
 1. Uninstall the Gateway API Inference Extension CRDs:
 
    ```bash