From 272e030e1f8074fb045c2a30d04da3189ed9cf5f Mon Sep 17 00:00:00 2001 From: ValentineDragan Date: Mon, 1 Dec 2025 08:12:04 +0000 Subject: [PATCH 1/2] Add queue_message_timeout_duration parameter to endpoints --- .../common/dtos/endpoint_builder.py | 1 + .../model_engine_server/common/dtos/llms.py | 2 ++ .../common/dtos/model_endpoints.py | 2 ++ .../domain/services/model_endpoint_service.py | 1 + .../use_cases/model_endpoint_use_cases.py | 1 + .../live_model_endpoint_infra_gateway.py | 2 ++ .../asb_queue_endpoint_resource_delegate.py | 28 +++++++++++++++++-- .../fake_queue_endpoint_resource_delegate.py | 3 +- .../live_endpoint_resource_gateway.py | 8 +++++- .../queue_endpoint_resource_delegate.py | 6 ++++ .../sqs_queue_endpoint_resource_delegate.py | 8 ++++-- .../services/live_model_endpoint_service.py | 2 ++ 12 files changed, 57 insertions(+), 7 deletions(-) diff --git a/model-engine/model_engine_server/common/dtos/endpoint_builder.py b/model-engine/model_engine_server/common/dtos/endpoint_builder.py index 64ea43d0d..43f92d661 100644 --- a/model-engine/model_engine_server/common/dtos/endpoint_builder.py +++ b/model-engine/model_engine_server/common/dtos/endpoint_builder.py @@ -33,6 +33,7 @@ class BuildEndpointRequest(BaseModel): high_priority: Optional[bool] = None default_callback_url: Optional[str] = None default_callback_auth: Optional[CallbackAuth] = None + queue_message_timeout_duration: Optional[int] = None class BuildEndpointStatus(str, Enum): diff --git a/model-engine/model_engine_server/common/dtos/llms.py b/model-engine/model_engine_server/common/dtos/llms.py index 232b8ba6c..5991e79d7 100644 --- a/model-engine/model_engine_server/common/dtos/llms.py +++ b/model-engine/model_engine_server/common/dtos/llms.py @@ -70,6 +70,7 @@ class CreateLLMModelEndpointV1Request(BaseModel): default_callback_url: Optional[HttpUrlStr] = None default_callback_auth: Optional[CallbackAuth] = None public_inference: Optional[bool] = True # LLM endpoints are public by default. 
+ queue_message_timeout_duration: Optional[int] = Field(default=None, ge=1) class CreateLLMModelEndpointV1Response(BaseModel): @@ -137,6 +138,7 @@ class UpdateLLMModelEndpointV1Request(BaseModel): default_callback_url: Optional[HttpUrlStr] = None default_callback_auth: Optional[CallbackAuth] = None public_inference: Optional[bool] = None + queue_message_timeout_duration: Optional[int] = Field(default=None, ge=1) class UpdateLLMModelEndpointV1Response(BaseModel): diff --git a/model-engine/model_engine_server/common/dtos/model_endpoints.py b/model-engine/model_engine_server/common/dtos/model_endpoints.py index e86208904..0f5908a85 100644 --- a/model-engine/model_engine_server/common/dtos/model_endpoints.py +++ b/model-engine/model_engine_server/common/dtos/model_endpoints.py @@ -69,6 +69,7 @@ class CreateModelEndpointV1Request(BaseModel): default_callback_url: Optional[HttpUrlStr] = None default_callback_auth: Optional[CallbackAuth] = None public_inference: Optional[bool] = Field(default=False) + queue_message_timeout_duration: Optional[int] = Field(default=None, ge=1) class CreateModelEndpointV1Response(BaseModel): @@ -95,6 +96,7 @@ class UpdateModelEndpointV1Request(BaseModel): default_callback_url: Optional[HttpUrlStr] = None default_callback_auth: Optional[CallbackAuth] = None public_inference: Optional[bool] = None + queue_message_timeout_duration: Optional[int] = Field(default=None, ge=1) class UpdateModelEndpointV1Response(BaseModel): diff --git a/model-engine/model_engine_server/domain/services/model_endpoint_service.py b/model-engine/model_engine_server/domain/services/model_endpoint_service.py index 4c3471b4f..bc3737004 100644 --- a/model-engine/model_engine_server/domain/services/model_endpoint_service.py +++ b/model-engine/model_engine_server/domain/services/model_endpoint_service.py @@ -90,6 +90,7 @@ async def create_model_endpoint( default_callback_url: Optional[str], default_callback_auth: Optional[CallbackAuth], public_inference: Optional[bool] = 
False, + queue_message_timeout_duration: Optional[int] = None, ) -> ModelEndpointRecord: """ Creates a model endpoint. diff --git a/model-engine/model_engine_server/domain/use_cases/model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/model_endpoint_use_cases.py index 9d3553076..91349514b 100644 --- a/model-engine/model_engine_server/domain/use_cases/model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/model_endpoint_use_cases.py @@ -313,6 +313,7 @@ async def execute( default_callback_url=request.default_callback_url, default_callback_auth=request.default_callback_auth, public_inference=request.public_inference, + queue_message_timeout_duration=request.queue_message_timeout_duration, ) _handle_post_inference_hooks( created_by=user.user_id, diff --git a/model-engine/model_engine_server/infra/gateways/live_model_endpoint_infra_gateway.py b/model-engine/model_engine_server/infra/gateways/live_model_endpoint_infra_gateway.py index 4b73a3860..71bfd3bd7 100644 --- a/model-engine/model_engine_server/infra/gateways/live_model_endpoint_infra_gateway.py +++ b/model-engine/model_engine_server/infra/gateways/live_model_endpoint_infra_gateway.py @@ -73,6 +73,7 @@ def create_model_endpoint_infra( billing_tags: Optional[Dict[str, Any]] = None, default_callback_url: Optional[str], default_callback_auth: Optional[CallbackAuth], + queue_message_timeout_duration: Optional[int] = None, ) -> str: deployment_name = generate_deployment_name( model_endpoint_record.created_by, model_endpoint_record.name @@ -99,6 +100,7 @@ def create_model_endpoint_infra( billing_tags=billing_tags, default_callback_url=default_callback_url, default_callback_auth=default_callback_auth, + queue_message_timeout_duration=queue_message_timeout_duration, ) response = self.task_queue_gateway.send_task( task_name=BUILD_TASK_NAME, diff --git a/model-engine/model_engine_server/infra/gateways/resources/asb_queue_endpoint_resource_delegate.py 
b/model-engine/model_engine_server/infra/gateways/resources/asb_queue_endpoint_resource_delegate.py index 3799ed654..8a224c5fa 100644 --- a/model-engine/model_engine_server/infra/gateways/resources/asb_queue_endpoint_resource_delegate.py +++ b/model-engine/model_engine_server/infra/gateways/resources/asb_queue_endpoint_resource_delegate.py @@ -1,5 +1,6 @@ import os -from typing import Any, Dict +from datetime import timedelta +from typing import Any, Dict, Optional from azure.core.exceptions import ResourceExistsError, ResourceNotFoundError from azure.identity import DefaultAzureCredential @@ -32,13 +33,36 @@ async def create_queue_if_not_exists( endpoint_name: str, endpoint_created_by: str, endpoint_labels: Dict[str, Any], + queue_message_timeout_duration: Optional[int] = None, ) -> QueueInfo: queue_name = QueueEndpointResourceDelegate.endpoint_id_to_queue_name(endpoint_id) + timeout_duration = queue_message_timeout_duration or 60 # Default to 60 seconds + + # Validation: Azure Service Bus lock duration must be <= 5 minutes (300s) + if timeout_duration > 300: + raise ValueError(f"queue_message_timeout_duration ({timeout_duration}s) exceeds Azure Service Bus maximum of 300 seconds") + with _get_servicebus_administration_client() as client: try: + # First, try to create the queue with default properties client.create_queue(queue_name=queue_name) + + # Then update the queue properties to set custom lock duration + queue_properties = client.get_queue(queue_name) + queue_properties.lock_duration = timedelta(seconds=timeout_duration) + client.update_queue(queue_properties) + except ResourceExistsError: - pass + # Queue already exists, update its properties if needed + try: + queue_properties = client.get_queue(queue_name) + # Only update if the lock duration is different + if queue_properties.lock_duration != timedelta(seconds=timeout_duration): + queue_properties.lock_duration = timedelta(seconds=timeout_duration) + client.update_queue(queue_properties) + except 
Exception as e: + # If we can't update properties, log but don't fail + logger.warning(f"Could not update queue properties for {queue_name}: {e}") return QueueInfo(queue_name, None) diff --git a/model-engine/model_engine_server/infra/gateways/resources/fake_queue_endpoint_resource_delegate.py b/model-engine/model_engine_server/infra/gateways/resources/fake_queue_endpoint_resource_delegate.py index 9ded2d6e5..b43e5c4cc 100644 --- a/model-engine/model_engine_server/infra/gateways/resources/fake_queue_endpoint_resource_delegate.py +++ b/model-engine/model_engine_server/infra/gateways/resources/fake_queue_endpoint_resource_delegate.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Sequence +from typing import Any, Dict, Optional, Sequence from model_engine_server.infra.gateways.resources.queue_endpoint_resource_delegate import ( QueueEndpointResourceDelegate, @@ -15,6 +15,7 @@ async def create_queue_if_not_exists( endpoint_name: str, endpoint_created_by: str, endpoint_labels: Dict[str, Any], + queue_message_timeout_duration: Optional[int] = None, ) -> QueueInfo: queue_name = QueueEndpointResourceDelegate.endpoint_id_to_queue_name(endpoint_id) queue_url = f"http://foobar.com/{queue_name}" diff --git a/model-engine/model_engine_server/infra/gateways/resources/live_endpoint_resource_gateway.py b/model-engine/model_engine_server/infra/gateways/resources/live_endpoint_resource_gateway.py index fb637c10f..61089a60b 100644 --- a/model-engine/model_engine_server/infra/gateways/resources/live_endpoint_resource_gateway.py +++ b/model-engine/model_engine_server/infra/gateways/resources/live_endpoint_resource_gateway.py @@ -38,6 +38,7 @@ async def create_queue( self, endpoint_record: ModelEndpointRecord, labels: Dict[str, str], + queue_message_timeout_duration: Optional[int] = None, ) -> QueueInfo: """Creates a new queue, returning its unique name and queue URL.""" queue_name, queue_url = await self.queue_delegate.create_queue_if_not_exists( @@ -45,6 +46,7 @@ async def 
create_queue( endpoint_name=endpoint_record.name, endpoint_created_by=endpoint_record.created_by, endpoint_labels=labels, + queue_message_timeout_duration=queue_message_timeout_duration, ) return QueueInfo(queue_name, queue_url) @@ -56,7 +58,11 @@ async def create_or_update_resources( request.build_endpoint_request.model_endpoint_record.endpoint_type == ModelEndpointType.ASYNC ): - q = await self.create_queue(endpoint_record, request.build_endpoint_request.labels) + q = await self.create_queue( + endpoint_record, + request.build_endpoint_request.labels, + request.build_endpoint_request.queue_message_timeout_duration + ) queue_name: Optional[str] = q.queue_name queue_url: Optional[str] = q.queue_url destination: str = q.queue_name diff --git a/model-engine/model_engine_server/infra/gateways/resources/queue_endpoint_resource_delegate.py b/model-engine/model_engine_server/infra/gateways/resources/queue_endpoint_resource_delegate.py index 76c77e64b..4998b2959 100644 --- a/model-engine/model_engine_server/infra/gateways/resources/queue_endpoint_resource_delegate.py +++ b/model-engine/model_engine_server/infra/gateways/resources/queue_endpoint_resource_delegate.py @@ -24,9 +24,15 @@ async def create_queue_if_not_exists( endpoint_name: str, endpoint_created_by: str, endpoint_labels: Dict[str, Any], + queue_message_timeout_duration: Optional[int] = None, ) -> QueueInfo: """ Creates a queue associated with the given endpoint_id. Other fields are set as tags on the queue. + + Args: + queue_message_timeout_duration: Optional timeout duration in seconds for queue messages. + For SQS, this sets the VisibilityTimeout. + For Azure Service Bus, this sets the lock_duration (max 300 seconds). 
""" @abstractmethod diff --git a/model-engine/model_engine_server/infra/gateways/resources/sqs_queue_endpoint_resource_delegate.py b/model-engine/model_engine_server/infra/gateways/resources/sqs_queue_endpoint_resource_delegate.py index 748c3f699..8953d4ae4 100644 --- a/model-engine/model_engine_server/infra/gateways/resources/sqs_queue_endpoint_resource_delegate.py +++ b/model-engine/model_engine_server/infra/gateways/resources/sqs_queue_endpoint_resource_delegate.py @@ -55,7 +55,11 @@ async def create_queue_if_not_exists( endpoint_name: str, endpoint_created_by: str, endpoint_labels: Dict[str, Any], + queue_message_timeout_duration: Optional[int] = None, ) -> QueueInfo: + # Use provided timeout or default to 43200 (12 hours, max SQS visibility) + timeout_duration = queue_message_timeout_duration or 43200 + async with _create_async_sqs_client(sqs_profile=self.sqs_profile) as sqs_client: queue_name = QueueEndpointResourceDelegate.endpoint_id_to_queue_name(endpoint_id) @@ -73,9 +77,7 @@ async def create_queue_if_not_exists( create_response = await sqs_client.create_queue( QueueName=queue_name, Attributes=dict( - VisibilityTimeout="43200", - # To match current hardcoded Celery timeout of 24hr - # However, the max SQS visibility is 12hrs. 
+ VisibilityTimeout=str(timeout_duration), Policy=_get_queue_policy(queue_name=queue_name), ), tags=_get_queue_tags( diff --git a/model-engine/model_engine_server/infra/services/live_model_endpoint_service.py b/model-engine/model_engine_server/infra/services/live_model_endpoint_service.py index 475fbca86..8fcc83c68 100644 --- a/model-engine/model_engine_server/infra/services/live_model_endpoint_service.py +++ b/model-engine/model_engine_server/infra/services/live_model_endpoint_service.py @@ -160,6 +160,7 @@ async def create_model_endpoint( default_callback_url: Optional[str] = None, default_callback_auth: Optional[CallbackAuth], public_inference: Optional[bool] = False, + queue_message_timeout_duration: Optional[int] = None, ) -> ModelEndpointRecord: existing_endpoints = ( await self.model_endpoint_record_repository.list_model_endpoint_records( @@ -203,6 +204,7 @@ async def create_model_endpoint( high_priority=high_priority, default_callback_url=default_callback_url, default_callback_auth=default_callback_auth, + queue_message_timeout_duration=queue_message_timeout_duration, ) await self.model_endpoint_record_repository.update_model_endpoint_record( model_endpoint_id=model_endpoint_record.id, From 16c2632d5444bfe594d6805347637aae7c69c780 Mon Sep 17 00:00:00 2001 From: ValentineDragan Date: Fri, 19 Dec 2025 10:15:34 +0000 Subject: [PATCH 2/2] Add directory with rollback instructions and helpers --- UPGRADE-PLAN.md | 331 ++++++++++++++++++ backups/ROLLBACK-INSTRUCTIONS.md | 105 ++++++ backups/backup-timestamp.txt | 1 + backups/current-env-vars.txt | 60 ++++ backups/current-image.txt | 1 + .../endpoint-builder-deployment-backup.yaml | 201 +++++++++++ backups/health-checks.txt | 21 ++ backups/helm-history.txt | 2 + backups/helm-values-backup.yaml | 149 ++++++++ backups/model-engine-deployment-backup.yaml | 196 +++++++++++ backups/quick-rollback.sh | 53 +++ 11 files changed, 1120 insertions(+) create mode 100644 UPGRADE-PLAN.md create mode 100644 
backups/ROLLBACK-INSTRUCTIONS.md create mode 100644 backups/backup-timestamp.txt create mode 100644 backups/current-env-vars.txt create mode 100644 backups/current-image.txt create mode 100644 backups/endpoint-builder-deployment-backup.yaml create mode 100644 backups/health-checks.txt create mode 100644 backups/helm-history.txt create mode 100644 backups/helm-values-backup.yaml create mode 100644 backups/model-engine-deployment-backup.yaml create mode 100755 backups/quick-rollback.sh diff --git a/UPGRADE-PLAN.md b/UPGRADE-PLAN.md new file mode 100644 index 000000000..b08266c83 --- /dev/null +++ b/UPGRADE-PLAN.md @@ -0,0 +1,331 @@ +# Model Engine Upgrade Plan - Queue Timeout Feature + +## 📋 Current State Analysis + +### Cluster Information +- **Cluster:** `sgpaz98031021k8s` +- **Resource Group:** `SGP98031021` +- **Namespace:** `launch` +- **Helm Release:** `launch-inference` (revision 1) +- **Chart Version:** `model-engine-0.1.13` + +### Current Deployments +1. **model-engine** (Gateway) + - Replicas: 2 + - Image: `022465994601.dkr.ecr.us-west-2.amazonaws.com/egp-mirror-int/model-engine:6e35c71cf82622fe2ad6e745728a65a1ff6f3984` + - Health check: `/readyz` on port 5000 + +2. **model-engine-endpoint-builder** + - Replicas: 1 + - Image: Same as gateway + +3. **model-engine-cacher** + - Replicas: 0 (scaled down) + +### Key Configuration (From Helm Values) +```yaml +azure: + abs_account_name: sgpaz98031021storage + abs_container_name: sgpaz98031021models + client_id: 1ef7e168-08e1-4798-b29b-7f5f9bd048ea + servicebus_namespace: sgpaz98031021llm-engine + keyvault_name: sgpaz98031021keyvault + +config: + values: + infra: + cloud_provider: azure + docker_repo_prefix: 022465994601.dkr.ecr.us-west-2.amazonaws.com + k8s_cluster_name: sgpaz98031021k8s + launch: + cache_redis_azure_host: sgpaz98031021rediscache.redis.cache.windows.net:6380 + endpoint_namespace: launch-inference +``` + +--- + +## 🎯 Upgrade Strategy: Rolling Update with New Image + +### Why This Approach? 
+✅ **Preserves all existing configuration** (env vars, secrets, volumes) +✅ **Zero downtime** with rolling update +✅ **Easy rollback** via Helm or kubectl +✅ **Minimal risk** - only image changes + +### What Will Change? +- ✅ Docker image tag (new image with your code) +- ❌ **NO** environment variables +- ❌ **NO** secrets or credentials +- ❌ **NO** ConfigMaps +- ❌ **NO** resource limits +- ❌ **NO** replica counts + +--- + +## 📝 Step-by-Step Upgrade Process + +### Phase 1: Build New Image +```bash +cd /home/20.scai.v.dragan/repos/llm-engine + +# Build image with your code changes +docker build -t 022465994601.dkr.ecr.us-west-2.amazonaws.com/egp-mirror-int/model-engine:queue-timeout-$(date +%Y%m%d-%H%M%S) \ + -f model-engine/Dockerfile . +``` + +### Phase 2: Push to ECR +```bash +# Login to ECR +aws ecr get-login-password --region us-west-2 | \ + docker login --username AWS --password-stdin 022465994601.dkr.ecr.us-west-2.amazonaws.com + +# Push image +docker push 022465994601.dkr.ecr.us-west-2.amazonaws.com/egp-mirror-int/model-engine:queue-timeout-TIMESTAMP +``` + +### Phase 3: Upgrade Helm Release +```bash +# Upgrade with new image tag ONLY +helm upgrade launch-inference /home/20.scai.v.dragan/repos/llm-engine/charts/model-engine \ + --namespace launch \ + --set tag=queue-timeout-TIMESTAMP \ + --reuse-values \ + --wait \ + --timeout 10m +``` + +**CRITICAL:** The `--reuse-values` flag ensures ALL existing values (including env vars, secrets, etc.) are preserved! 
+ +### Phase 4: Monitor Deployment +```bash +# Watch the rollout +kubectl rollout status deployment/model-engine -n launch -w + +# In another terminal, watch pods +kubectl get pods -n launch -w | grep model-engine + +# Check logs +kubectl logs -f deployment/model-engine -n launch +``` + +--- + +## 🛡️ Safety Measures + +### ✅ Backups Created +All backups are in `/home/20.scai.v.dragan/repos/llm-engine/backups/`: +- `helm-values-backup.yaml` - Current Helm values +- `model-engine-deployment-backup.yaml` - Gateway deployment +- `endpoint-builder-deployment-backup.yaml` - Builder deployment +- `current-image.txt` - Current image tag +- `health-checks.txt` - Health check configuration +- `current-env-vars.txt` - Environment variables +- `quick-rollback.sh` - Executable rollback script + +### ✅ Rollback Options + +**Option 1: Helm Rollback (Recommended)** +```bash +cd /home/20.scai.v.dragan/repos/llm-engine/backups +./quick-rollback.sh +``` + +**Option 2: Manual kubectl** +```bash +kubectl set image deployment/model-engine \ + model-engine=022465994601.dkr.ecr.us-west-2.amazonaws.com/egp-mirror-int/model-engine:6e35c71cf82622fe2ad6e745728a65a1ff6f3984 \ + -n launch +``` + +### ✅ Rolling Update Configuration +Current strategy: +- `maxSurge: 25%` - Can create 1 extra pod during update (2 replicas * 0.25 = 0.5, rounded up to 1) +- `maxUnavailable: 0` - Always keep at least 2 pods running +- **Result:** Zero downtime guaranteed + +--- + +## ⚠️ Potential Failure Scenarios & Mitigations + +### 1. Image Pull Failure +**Symptoms:** Pods stuck in `ImagePullBackOff` + +**Mitigation:** +- Test image pull before upgrade: + ```bash + docker pull 022465994601.dkr.ecr.us-west-2.amazonaws.com/egp-mirror-int/model-engine:queue-timeout-TIMESTAMP + ``` + +**Rollback:** Automatic - old pods keep running + +--- + +### 2. 
Health Check Failure +**Symptoms:** New pods never become Ready + +**Mitigation:** +- Health check endpoint: `/readyz` on port 5000 +- Timeout: 1 second +- Failure threshold: 30 attempts +- **Your code must respond to `/readyz` within 1 second** + +**Rollback:** Run `./quick-rollback.sh` + +--- + +### 3. Code Crashes on Startup +**Symptoms:** Pods crash loop with `CrashLoopBackOff` + +**Mitigation:** +- Check logs immediately: + ```bash + kubectl logs -l app=model-engine -n launch --tail=100 + ``` + +**Rollback:** Run `./quick-rollback.sh` + +--- + +### 4. Configuration Mismatch +**Symptoms:** Pods run but API errors + +**Mitigation:** +- Your code changes should be **backward compatible** +- Don't change: + - API endpoints + - Environment variable names + - Database schema + - ConfigMap structure + +**Rollback:** Run `./quick-rollback.sh` + +--- + +## ✅ Pre-Upgrade Checklist + +Before running the upgrade: + +- [ ] All backups created (check `/backups/` directory) +- [ ] Rollback script tested (`./quick-rollback.sh --help`) +- [ ] Docker image built successfully +- [ ] Image pushed to ECR +- [ ] Image can be pulled from ECR +- [ ] Code changes are backward compatible +- [ ] Health check endpoint (`/readyz`) works in your code +- [ ] No database migrations required +- [ ] Team notified about upgrade + +--- + +## 📊 Monitoring During Upgrade + +### Terminal 1: Watch Rollout +```bash +kubectl rollout status deployment/model-engine -n launch -w +``` + +### Terminal 2: Watch Pods +```bash +kubectl get pods -n launch -w | grep model-engine +``` + +### Terminal 3: Watch Logs +```bash +kubectl logs -f deployment/model-engine -n launch +``` + +### Terminal 4: Watch Events +```bash +kubectl get events -n launch --sort-by='.lastTimestamp' -w | grep model-engine +``` + +--- + +## 🎉 Post-Upgrade Verification + +After upgrade completes: + +```bash +# 1. Check all pods are running +kubectl get pods -n launch | grep model-engine + +# 2. 
Verify new image is deployed +kubectl get deployment model-engine -n launch -o jsonpath='{.spec.template.spec.containers[0].image}' + +# 3. Check pod health +kubectl get pods -n launch -l app=model-engine -o wide + +# 4. Test API health endpoint +kubectl port-forward -n launch svc/model-engine 8080:80 +# In another terminal: +curl http://localhost:8080/health +curl http://localhost:8080/readyz + +# 5. Test queue timeout feature +python /home/20.scai.v.dragan/repos/llm-engine/test_complete_queue_timeout_flow.py + +# 6. Verify Azure Service Bus queues +az servicebus queue list \ + --namespace-name sgpaz98031021llm-engine \ + --resource-group SGP98031021 \ + --query "[].{name:name, lockDuration:lockDuration}" \ + --output table +``` + +--- + +## 🔄 Environment Variables - What's Preserved? + +**The `--reuse-values` flag ensures these are NOT changed:** + +From current deployment: +- Azure credentials (`client_id`, `identity_name`, etc.) +- Service Bus namespace +- Redis configuration +- Storage account settings +- All secrets and ConfigMaps +- Resource limits +- Replica counts +- Health check configuration + +**Only the image tag changes!** + +--- + +## 📞 Emergency Contacts + +If something goes wrong: +1. **Immediate:** Run `./backups/quick-rollback.sh` +2. **Check logs:** `kubectl logs -l app=model-engine -n launch --tail=100` +3. **Check events:** `kubectl get events -n launch | grep model-engine` +4. **Escalate:** Contact team lead + +--- + +## 🎯 Success Criteria + +Upgrade is successful when: +- ✅ All pods are Running and Ready +- ✅ Health checks passing (`/readyz` returns 200) +- ✅ API endpoints responding +- ✅ Can create endpoints with `queue_message_timeout_duration` +- ✅ Azure Service Bus queues created with correct lock duration +- ✅ No error logs in pods +- ✅ Helm release shows as "deployed" + +--- + +## Next Steps + +Ready to proceed? Run: +```bash +cd /home/20.scai.v.dragan/repos/llm-engine +./build_and_deploy_queue_timeout.sh +``` + +This script will: +1. 
Build the Docker image with your changes +2. Push to ECR +3. Upgrade the Helm release +4. Monitor the deployment +5. Verify success diff --git a/backups/ROLLBACK-INSTRUCTIONS.md b/backups/ROLLBACK-INSTRUCTIONS.md new file mode 100644 index 000000000..ce084ccf5 --- /dev/null +++ b/backups/ROLLBACK-INSTRUCTIONS.md @@ -0,0 +1,105 @@ +# Rollback Instructions + +**Backup Created:** Tue Nov 11 18:12:52 UTC 2025 (recorded in `backup-timestamp.txt`) +**Current Image:** `022465994601.dkr.ecr.us-west-2.amazonaws.com/egp-mirror-int/model-engine:6e35c71cf82622fe2ad6e745728a65a1ff6f3984` +**Helm Release:** `launch-inference` +**Namespace:** `launch` +**Current Revision:** 1 + +--- + +## 🚨 Quick Rollback (Recommended) + +If the upgrade fails, use Helm rollback: + +```bash +# Rollback to previous revision (revision 1) +helm rollback launch-inference 1 -n launch + +# Monitor the rollback +kubectl rollout status deployment/model-engine -n launch +kubectl rollout status deployment/model-engine-endpoint-builder -n launch + +# Verify pods are healthy +kubectl get pods -n launch | grep model-engine +``` + +--- + +## 🔧 Manual Rollback (If Helm Fails) + +If Helm rollback doesn't work, manually set the image: + +```bash +# Rollback model-engine deployment +kubectl set image deployment/model-engine \ + model-engine=022465994601.dkr.ecr.us-west-2.amazonaws.com/egp-mirror-int/model-engine:6e35c71cf82622fe2ad6e745728a65a1ff6f3984 \ + -n launch + +# Rollback endpoint-builder deployment +kubectl set image deployment/model-engine-endpoint-builder \ + model-engine-endpoint-builder=022465994601.dkr.ecr.us-west-2.amazonaws.com/egp-mirror-int/model-engine:6e35c71cf82622fe2ad6e745728a65a1ff6f3984 \ + -n launch + +# Monitor rollback +kubectl rollout status deployment/model-engine -n launch +kubectl rollout status deployment/model-engine-endpoint-builder -n launch +``` + +--- + +## 🆘 Emergency Restore (Nuclear Option) + +If everything fails, restore from backup files: + +```bash +cd /home/20.scai.v.dragan/repos/llm-engine/backups + +# 
Delete current deployments +kubectl delete deployment model-engine -n launch +kubectl delete deployment model-engine-endpoint-builder -n launch + +# Wait for deletion +sleep 10 + +# Restore from backup +kubectl apply -f model-engine-deployment-backup.yaml +kubectl apply -f endpoint-builder-deployment-backup.yaml + +# Wait for pods to be ready +kubectl wait --for=condition=ready pod -l app=model-engine -n launch --timeout=300s +``` + +--- + +## ✅ Verification After Rollback + +```bash +# Check pods are running +kubectl get pods -n launch | grep model-engine + +# Check image is correct +kubectl get deployment model-engine -n launch -o jsonpath='{.spec.template.spec.containers[0].image}' + +# Check logs for errors +kubectl logs -l app=model-engine -n launch --tail=50 + +# Test the API +kubectl port-forward -n launch svc/model-engine 8080:80 +# Then in another terminal: curl http://localhost:8080/health +``` + +--- + +## 📊 Monitoring Commands + +```bash +# Watch pods during rollback +kubectl get pods -n launch -w | grep model-engine + +# Check events +kubectl get events -n launch --sort-by='.lastTimestamp' | grep model-engine + +# View logs +kubectl logs -f deployment/model-engine -n launch +``` diff --git a/backups/backup-timestamp.txt b/backups/backup-timestamp.txt new file mode 100644 index 000000000..4e4fc97e1 --- /dev/null +++ b/backups/backup-timestamp.txt @@ -0,0 +1 @@ +Tue Nov 11 18:12:52 UTC 2025 diff --git a/backups/current-env-vars.txt b/backups/current-env-vars.txt new file mode 100644 index 000000000..8f3678f8f --- /dev/null +++ b/backups/current-env-vars.txt @@ -0,0 +1,60 @@ + tags.datadoghq.com/env: production + tags.datadoghq.com/service: model-engine + tags.datadoghq.com/version: 6e35c71cf82622fe2ad6e745728a65a1ff6f3984 + team: infra + name: model-engine + namespace: launch + resourceVersion: "230220698" + uid: 8b6561e8-447a-4379-b2e7-36fb01553100 +spec: + progressDeadlineSeconds: 600 + replicas: 2 + revisionHistoryLimit: 10 + selector: + matchLabels: 
+ app: model-engine + strategy: + rollingUpdate: + maxSurge: 25% + maxUnavailable: 0 + type: RollingUpdate + template: + metadata: + annotations: + ad.datadoghq.com/main.logs: | + [{ + "service": "model-engine", + "source": "python" + }] + creationTimestamp: null + labels: + app: model-engine + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: 6e35c71cf82622fe2ad6e745728a65a1ff6f3984 + azure.workload.identity/use: "true" + helm.sh/chart: model-engine-0.1.13 + product: model-engine + tags.datadoghq.com/env: production + tags.datadoghq.com/service: model-engine + tags.datadoghq.com/version: 6e35c71cf82622fe2ad6e745728a65a1ff6f3984 + team: infra + spec: + containers: + - args: + - python + - -m + - model_engine_server.entrypoints.start_fastapi_server + command: + - dumb-init + - -- + env: + - name: DD_TRACE_ENABLED + value: "true" + - name: DD_REMOTE_CONFIGURATION_ENABLED + value: "false" + - name: DD_ENV + value: production + - name: DD_AGENT_HOST + valueFrom: + fieldRef: + apiVersion: v1 diff --git a/backups/current-image.txt b/backups/current-image.txt new file mode 100644 index 000000000..41d55973a --- /dev/null +++ b/backups/current-image.txt @@ -0,0 +1 @@ +022465994601.dkr.ecr.us-west-2.amazonaws.com/egp-mirror-int/model-engine:6e35c71cf82622fe2ad6e745728a65a1ff6f3984 \ No newline at end of file diff --git a/backups/endpoint-builder-deployment-backup.yaml b/backups/endpoint-builder-deployment-backup.yaml new file mode 100644 index 000000000..df3132f6b --- /dev/null +++ b/backups/endpoint-builder-deployment-backup.yaml @@ -0,0 +1,201 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + annotations: + deployment.kubernetes.io/revision: "1" + meta.helm.sh/release-name: launch-inference + meta.helm.sh/release-namespace: launch + creationTimestamp: "2025-03-10T20:54:36Z" + generation: 1 + labels: + app: model-engine-endpoint-builder + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: 6e35c71cf82622fe2ad6e745728a65a1ff6f3984 + 
azure.workload.identity/use: "true" + helm.sh/chart: model-engine-0.1.13 + helm.toolkit.fluxcd.io/name: launch-inference + helm.toolkit.fluxcd.io/namespace: launch + product: model-engine + tags.datadoghq.com/env: production + tags.datadoghq.com/service: model-engine-endpoint-builder + tags.datadoghq.com/version: 6e35c71cf82622fe2ad6e745728a65a1ff6f3984 + team: infra + name: model-engine-endpoint-builder + namespace: launch + resourceVersion: "283977885" + uid: 3948f7ac-3080-47ed-9c4c-2dd50b6f33e4 +spec: + progressDeadlineSeconds: 600 + replicas: 1 + revisionHistoryLimit: 10 + selector: + matchLabels: + app: model-engine-endpoint-builder + strategy: + rollingUpdate: + maxSurge: 25% + maxUnavailable: 25% + type: RollingUpdate + template: + metadata: + annotations: + ad.datadoghq.com/main.logs: | + [{ + "service": "\"model-engine-endpoint-builder\"", + "source": "python" + }] + cluster-autoscaler.kubernetes.io/safe-to-evict: "false" + creationTimestamp: null + labels: + app: model-engine-endpoint-builder + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: 6e35c71cf82622fe2ad6e745728a65a1ff6f3984 + azure.workload.identity/use: "true" + helm.sh/chart: model-engine-0.1.13 + product: model-engine + sidecar.istio.io/inject: "false" + tags.datadoghq.com/env: production + tags.datadoghq.com/service: model-engine-endpoint-builder + tags.datadoghq.com/version: 6e35c71cf82622fe2ad6e745728a65a1ff6f3984 + team: infra + spec: + containers: + - args: + - celery + - --app=model_engine_server.service_builder + - worker + - --loglevel=INFO + - --concurrency=2 + - --queues=model-engine-service-builder + command: + - dumb-init + - -- + env: + - name: DD_TRACE_ENABLED + value: "true" + - name: DD_REMOTE_CONFIGURATION_ENABLED + value: "false" + - name: DD_ENV + value: production + - name: DD_AGENT_HOST + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.hostIP + - name: SERVICE_IDENTIFIER + - name: GATEWAY_URL + value: http://model-engine.launch:80 + - name: 
DB_SECRET_NAME + value: prod-ml-infra-pg + - name: DEPLOY_SERVICE_CONFIG_PATH + value: /workspace/model-engine/service_configs/service_config.yaml + - name: ML_INFRA_SERVICES_CONFIG_PATH + value: /workspace/model-engine/model_engine_server/core/configs/config.yaml + - name: CELERY_ELASTICACHE_ENABLED + value: "true" + - name: LAUNCH_SERVICE_TEMPLATE_FOLDER + value: /workspace/model-engine/model_engine_server/infra/gateways/resources/templates + - name: AZURE_IDENTITY_NAME + value: sgpaz98031021-uai-model-engine + - name: AZURE_CLIENT_ID + value: 1ef7e168-08e1-4798-b29b-7f5f9bd048ea + - name: AZURE_OBJECT_ID + value: 84f54dea-e1dc-42a6-8428-a5ec93cd1699 + - name: KEYVAULT_NAME + value: sgpaz98031021keyvault + - name: ABS_ACCOUNT_NAME + value: sgpaz98031021storage + - name: ABS_CONTAINER_NAME + value: sgpaz98031021models + - name: SERVICEBUS_NAMESPACE + value: sgpaz98031021llm-engine + - name: DD_VERSION + value: 6e35c71cf82622fe2ad6e745728a65a1ff6f3984 + - name: GIT_TAG + value: 6e35c71cf82622fe2ad6e745728a65a1ff6f3984 + - name: DD_SERVICE + value: model-engine-endpoint-builder + image: 022465994601.dkr.ecr.us-west-2.amazonaws.com/egp-mirror-int/model-engine:6e35c71cf82622fe2ad6e745728a65a1ff6f3984 + imagePullPolicy: IfNotPresent + name: model-engine-endpoint-builder + ports: + - containerPort: 5000 + name: http + protocol: TCP + readinessProbe: + exec: + command: + - cat + - /tmp/readyz + failureThreshold: 3 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 + resources: + requests: + cpu: "2" + ephemeral-storage: 256Mi + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /workspace/model-engine/model_engine_server/infra/gateways/resources/templates + name: service-template-config + - mountPath: /workspace/model-engine/service_configs + name: model-engine-service-config-volume + - mountPath: /workspace/model-engine/model_engine_server/core/configs + name: 
infra-service-config-volume + dnsPolicy: ClusterFirst + imagePullSecrets: + - name: egp-ecr-regcred + nodeSelector: + node-lifecycle: normal + restartPolicy: Always + schedulerName: default-scheduler + securityContext: {} + serviceAccount: model-engine + serviceAccountName: model-engine + terminationGracePeriodSeconds: 30 + volumes: + - emptyDir: + medium: Memory + name: dshm + - configMap: + defaultMode: 420 + name: model-engine-service-template-config + name: service-template-config + - configMap: + defaultMode: 420 + items: + - key: launch_service_config + path: service_config.yaml + name: model-engine-service-config + name: model-engine-service-config-volume + - configMap: + defaultMode: 420 + items: + - key: infra_service_config + path: config.yaml + name: model-engine-service-config + name: infra-service-config-volume +status: + availableReplicas: 1 + conditions: + - lastTransitionTime: "2025-03-10T20:54:36Z" + lastUpdateTime: "2025-03-10T20:59:46Z" + message: ReplicaSet "model-engine-endpoint-builder-79544dd49b" has successfully + progressed. + reason: NewReplicaSetAvailable + status: "True" + type: Progressing + - lastTransitionTime: "2025-11-04T18:35:29Z" + lastUpdateTime: "2025-11-04T18:35:29Z" + message: Deployment has minimum availability. 
+ reason: MinimumReplicasAvailable + status: "True" + type: Available + observedGeneration: 1 + readyReplicas: 1 + replicas: 1 + updatedReplicas: 1 diff --git a/backups/health-checks.txt b/backups/health-checks.txt new file mode 100644 index 000000000..0c041f863 --- /dev/null +++ b/backups/health-checks.txt @@ -0,0 +1,21 @@ + readinessProbe: + failureThreshold: 30 + httpGet: + path: /readyz + port: 5000 + scheme: HTTP + periodSeconds: 2 + successThreshold: 1 + timeoutSeconds: 1 + resources: + requests: + cpu: "2" + ephemeral-storage: 256Mi + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /workspace/model-engine/model_engine_server/infra/gateways/resources/templates + name: service-template-config + - mountPath: /workspace/model-engine/service_configs diff --git a/backups/helm-history.txt b/backups/helm-history.txt new file mode 100644 index 000000000..277a277ab --- /dev/null +++ b/backups/helm-history.txt @@ -0,0 +1,2 @@ +REVISION UPDATED STATUS CHART APP VERSION DESCRIPTION +1 Mon Mar 10 20:52:21 2025 deployed model-engine-0.1.13 1.0.0 Install complete diff --git a/backups/helm-values-backup.yaml b/backups/helm-values-backup.yaml new file mode 100644 index 000000000..4046cb19e --- /dev/null +++ b/backups/helm-values-backup.yaml @@ -0,0 +1,149 @@ +USER-SUPPLIED VALUES: +affinity: {} +autoscaling: + horizontal: + enabled: true + maxReplicas: 10 + minReplicas: 2 + targetConcurrency: 50 + prewarming: + enabled: false + vertical: + enabled: false +azure: + abs_account_name: sgpaz98031021storage + abs_container_name: sgpaz98031021models + client_id: 1ef7e168-08e1-4798-b29b-7f5f9bd048ea + identity_name: sgpaz98031021-uai-model-engine + inference_client_id: ed30f185-30bf-44e8-9432-d76690ea4223 + keyvault_name: sgpaz98031021keyvault + object_id: 84f54dea-e1dc-42a6-8428-a5ec93cd1699 + servicebus_namespace: sgpaz98031021llm-engine +celery_autoscaler: + enabled: true + 
num_shards: 3 +celeryBrokerType: servicebus +config: + values: + infra: + cloud_provider: azure + default_region: default + dns_host_domain: llm-engine.domain.com + docker_repo_prefix: 022465994601.dkr.ecr.us-west-2.amazonaws.com + k8s_cluster_name: sgpaz98031021k8s + ml_account_id: "" + redis_host: sgpaz98031021rediscache.redis.cache.windows.net + s3_bucket: "" + launch: + batch_inference_vllm_repository: llm-engine/batch-infer-vllm + billing_queue_arn: unused + cache_redis_azure_host: sgpaz98031021rediscache.redis.cache.windows.net:6380 + cloud_file_llm_fine_tune_repository: azure://sgpaz98031021models/hosted-model-inference/llm-ft-job-repository + dd_trace_enabled: false + docker_image_layer_cache_repository: scale-egp-98031021-kaniko-cache + endpoint_namespace: launch-inference + hf_user_fine_tuned_weights_prefix: https://sgpaz98031021storage.blob.core.windows.net/sgpaz98031021models/hosted-model-inference/fine_tuned_weights + istio_enabled: true + lightllm_repository: lightllm + model_primitive_host: unused + sensitive_log_mode: true + sqs_profile: "" + sqs_queue_policy_template: "" + sqs_queue_tag_template: "" + tensorrt_llm_repository: tensorrt-llm + tgi_repository: text-generation-inference + user_inference_base_repository: scale-egp-98031021-launch/inference + user_inference_pytorch_repository: scale-egp-98031021-launch/inference + user_inference_tensorflow_repository: unused + vllm_repository: vllm +context: production +datadog: + enabled: false +db: + runDbInitScript: true +destinationrule: + annotations: {} + enabled: true +hostDomain: + prefix: http:// +image: + builderRepository: 022465994601.dkr.ecr.us-west-2.amazonaws.com/egp-mirror-int/model-engine + cacherRepository: 022465994601.dkr.ecr.us-west-2.amazonaws.com/egp-mirror-int/model-engine + forwarderRepository: 022465994601.dkr.ecr.us-west-2.amazonaws.com/egp-mirror-int/model-engine + gatewayRepository: 022465994601.dkr.ecr.us-west-2.amazonaws.com/egp-mirror-int/model-engine + pullPolicy: 
IfNotPresent +imageCache: + devices: + - name: cpu + nodeSelector: + cpu-only: "true" + - name: a10 + nodeSelector: + k8s.amazonaws.com/accelerator: nvidia-ampere-a10 + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + - name: a100 + nodeSelector: + k8s.amazonaws.com/accelerator: nvidia-ampere-a100 + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + - name: t4 + nodeSelector: + k8s.amazonaws.com/accelerator: nvidia-tesla-t4 + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists +imagePullSecrets: +- name: egp-ecr-regcred +populateFineTuningRepository: true +replicaCount: + balloonA10: 0 + balloonA100: 0 + balloonCpu: 0 + balloonT4: 0 + builder: 1 + cacher: 0 + gateway: 2 +resources: + requests: + cpu: 2 + ephemeral-storage: 256Mi +restartKedaOperator: true +secrets: + cloudDatabaseSecretName: prod-ml-infra-pg +service: + port: 80 + type: ClusterIP +serviceAccount: + annotations: + helm.sh/hook: pre-install,pre-upgrade + helm.sh/hook-weight: "-2" + namespaces: + - model-engine +serviceTemplate: + createServiceAccount: true + mountInfraConfig: true + securityContext: + capabilities: + drop: + - all + serviceAccountAnnotations: + helm.sh/hook: pre-install,pre-upgrade + helm.sh/hook-weight: "-2" + serviceAccountName: model-engine +tag: 6e35c71cf82622fe2ad6e745728a65a1ff6f3984 +tolerations: [] +triton: + image: + repository: unused + tag: unused +virtualservice: + annotations: {} + enabled: true + gateways: [] + hostDomains: [] diff --git a/backups/model-engine-deployment-backup.yaml b/backups/model-engine-deployment-backup.yaml new file mode 100644 index 000000000..700a1c95e --- /dev/null +++ b/backups/model-engine-deployment-backup.yaml @@ -0,0 +1,196 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + annotations: + deployment.kubernetes.io/revision: "1" + meta.helm.sh/release-name: launch-inference + meta.helm.sh/release-namespace: launch + creationTimestamp: "2025-03-10T20:54:36Z" 
+ generation: 6 + labels: + app: model-engine + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: 6e35c71cf82622fe2ad6e745728a65a1ff6f3984 + azure.workload.identity/use: "true" + helm.sh/chart: model-engine-0.1.13 + helm.toolkit.fluxcd.io/name: launch-inference + helm.toolkit.fluxcd.io/namespace: launch + product: model-engine + tags.datadoghq.com/env: production + tags.datadoghq.com/service: model-engine + tags.datadoghq.com/version: 6e35c71cf82622fe2ad6e745728a65a1ff6f3984 + team: infra + name: model-engine + namespace: launch + resourceVersion: "230220698" + uid: 8b6561e8-447a-4379-b2e7-36fb01553100 +spec: + progressDeadlineSeconds: 600 + replicas: 2 + revisionHistoryLimit: 10 + selector: + matchLabels: + app: model-engine + strategy: + rollingUpdate: + maxSurge: 25% + maxUnavailable: 0 + type: RollingUpdate + template: + metadata: + annotations: + ad.datadoghq.com/main.logs: | + [{ + "service": "model-engine", + "source": "python" + }] + creationTimestamp: null + labels: + app: model-engine + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: 6e35c71cf82622fe2ad6e745728a65a1ff6f3984 + azure.workload.identity/use: "true" + helm.sh/chart: model-engine-0.1.13 + product: model-engine + tags.datadoghq.com/env: production + tags.datadoghq.com/service: model-engine + tags.datadoghq.com/version: 6e35c71cf82622fe2ad6e745728a65a1ff6f3984 + team: infra + spec: + containers: + - args: + - python + - -m + - model_engine_server.entrypoints.start_fastapi_server + command: + - dumb-init + - -- + env: + - name: DD_TRACE_ENABLED + value: "true" + - name: DD_REMOTE_CONFIGURATION_ENABLED + value: "false" + - name: DD_ENV + value: production + - name: DD_AGENT_HOST + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.hostIP + - name: SERVICE_IDENTIFIER + - name: GATEWAY_URL + value: http://model-engine.launch:80 + - name: DB_SECRET_NAME + value: prod-ml-infra-pg + - name: DEPLOY_SERVICE_CONFIG_PATH + value: 
/workspace/model-engine/service_configs/service_config.yaml + - name: ML_INFRA_SERVICES_CONFIG_PATH + value: /workspace/model-engine/model_engine_server/core/configs/config.yaml + - name: CELERY_ELASTICACHE_ENABLED + value: "true" + - name: LAUNCH_SERVICE_TEMPLATE_FOLDER + value: /workspace/model-engine/model_engine_server/infra/gateways/resources/templates + - name: AZURE_IDENTITY_NAME + value: sgpaz98031021-uai-model-engine + - name: AZURE_CLIENT_ID + value: 1ef7e168-08e1-4798-b29b-7f5f9bd048ea + - name: AZURE_OBJECT_ID + value: 84f54dea-e1dc-42a6-8428-a5ec93cd1699 + - name: KEYVAULT_NAME + value: sgpaz98031021keyvault + - name: ABS_ACCOUNT_NAME + value: sgpaz98031021storage + - name: ABS_CONTAINER_NAME + value: sgpaz98031021models + - name: SERVICEBUS_NAMESPACE + value: sgpaz98031021llm-engine + - name: DD_VERSION + value: 6e35c71cf82622fe2ad6e745728a65a1ff6f3984 + - name: GIT_TAG + value: 6e35c71cf82622fe2ad6e745728a65a1ff6f3984 + - name: DD_SERVICE + value: model-engine + image: 022465994601.dkr.ecr.us-west-2.amazonaws.com/egp-mirror-int/model-engine:6e35c71cf82622fe2ad6e745728a65a1ff6f3984 + imagePullPolicy: IfNotPresent + name: model-engine + ports: + - containerPort: 5000 + name: http + protocol: TCP + readinessProbe: + failureThreshold: 30 + httpGet: + path: /readyz + port: 5000 + scheme: HTTP + periodSeconds: 2 + successThreshold: 1 + timeoutSeconds: 1 + resources: + requests: + cpu: "2" + ephemeral-storage: 256Mi + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /workspace/model-engine/model_engine_server/infra/gateways/resources/templates + name: service-template-config + - mountPath: /workspace/model-engine/service_configs + name: model-engine-service-config-volume + - mountPath: /workspace/model-engine/model_engine_server/core/configs + name: infra-service-config-volume + dnsPolicy: ClusterFirst + imagePullSecrets: + - name: egp-ecr-regcred + 
nodeSelector: + node-lifecycle: normal + priorityClassName: model-engine-high-priority + restartPolicy: Always + schedulerName: default-scheduler + securityContext: {} + serviceAccount: model-engine + serviceAccountName: model-engine + terminationGracePeriodSeconds: 30 + volumes: + - emptyDir: + medium: Memory + name: dshm + - configMap: + defaultMode: 420 + name: model-engine-service-template-config + name: service-template-config + - configMap: + defaultMode: 420 + items: + - key: launch_service_config + path: service_config.yaml + name: model-engine-service-config + name: model-engine-service-config-volume + - configMap: + defaultMode: 420 + items: + - key: infra_service_config + path: config.yaml + name: model-engine-service-config + name: infra-service-config-volume +status: + availableReplicas: 2 + conditions: + - lastTransitionTime: "2025-03-10T20:54:36Z" + lastUpdateTime: "2025-03-10T20:55:26Z" + message: ReplicaSet "model-engine-f68b7c769" has successfully progressed. + reason: NewReplicaSetAvailable + status: "True" + type: Progressing + - lastTransitionTime: "2025-09-24T16:13:58Z" + lastUpdateTime: "2025-09-24T16:13:58Z" + message: Deployment has minimum availability. + reason: MinimumReplicasAvailable + status: "True" + type: Available + observedGeneration: 6 + readyReplicas: 2 + replicas: 2 + updatedReplicas: 2 diff --git a/backups/quick-rollback.sh b/backups/quick-rollback.sh new file mode 100755 index 000000000..b0bb76368 --- /dev/null +++ b/backups/quick-rollback.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Quick Rollback Script +# This will rollback the Helm release to revision 1 + +set -e + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +echo -e "${RED}🚨 ROLLBACK INITIATED${NC}" +echo -e "${YELLOW}This will rollback launch-inference to revision 1${NC}" +echo + +read -p "Are you sure you want to rollback? (yes/no) " -r +if [[ ! 
$REPLY =~ ^yes$ ]]; then + echo -e "${RED}Rollback cancelled${NC}" + exit 1 +fi + +echo -e "${BLUE}1. Rolling back Helm release...${NC}" +helm rollback launch-inference 1 -n launch + +echo -e "${BLUE}2. Monitoring model-engine deployment...${NC}" +kubectl rollout status deployment/model-engine -n launch --timeout=300s + +echo -e "${BLUE}3. Monitoring endpoint-builder deployment...${NC}" +kubectl rollout status deployment/model-engine-endpoint-builder -n launch --timeout=300s + +echo -e "${GREEN}✅ Rollback completed!${NC}" +echo + +echo -e "${BLUE}Verifying pods...${NC}" +kubectl get pods -n launch | grep model-engine + +echo +echo -e "${BLUE}Verifying image...${NC}" +CURRENT_IMAGE=$(kubectl get deployment model-engine -n launch -o jsonpath='{.spec.template.spec.containers[0].image}') +EXPECTED_IMAGE="022465994601.dkr.ecr.us-west-2.amazonaws.com/egp-mirror-int/model-engine:6e35c71cf82622fe2ad6e745728a65a1ff6f3984" + +if [ "$CURRENT_IMAGE" = "$EXPECTED_IMAGE" ]; then + echo -e "${GREEN}✅ Image verified: $CURRENT_IMAGE${NC}" +else + echo -e "${RED}❌ Image mismatch!${NC}" + echo -e " Expected: $EXPECTED_IMAGE" + echo -e " Got: $CURRENT_IMAGE" +fi + +echo +echo -e "${GREEN}🎉 Rollback successful!${NC}"