1111from typing import Dict , Optional , Set
1212from dataclasses import dataclass , field
1313from enum import Enum
14- import threading
1514from datetime import datetime , timedelta
1615
1716import aiofiles
@@ -63,14 +62,31 @@ def resolve_with_system_dns(hostname):
6362
6463PLAYWRIGHT_WS_URL = f"ws://127.0.0.1:3000/"
6564
65+ # --- CUSTOM EXCEPTIONS ---
66+
67+ class KernelError (Exception ):
68+ """Base exception for kernel-related errors"""
69+ pass
70+
71+ class NoKernelAvailableError (KernelError ):
72+ """Raised when no kernels are available in the pool"""
73+ pass
74+
75+ class KernelExecutionError (KernelError ):
76+ """Raised when kernel execution fails"""
77+ pass
78+
79+ class KernelTimeoutError (KernelError ):
80+ """Raised when kernel operation times out"""
81+ pass
82+
6683# --- KERNEL MANAGEMENT CLASSES ---
6784
6885class KernelState (Enum ):
6986 HEALTHY = "healthy"
7087 BUSY = "busy"
7188 UNRESPONSIVE = "unresponsive"
7289 FAILED = "failed"
73- RESTARTING = "restarting"
7490
7591@dataclass
7692class KernelInfo :
@@ -80,14 +96,10 @@ class KernelInfo:
8096 last_health_check : datetime = field (default_factory = datetime .now )
8197 current_operation : Optional [str ] = None
8298 failure_count : int = 0
83- websocket : Optional [object ] = None
8499
85100 def is_available (self ) -> bool :
86101 return self .state == KernelState .HEALTHY
87102
88- def is_stale (self ) -> bool :
89- return datetime .now () - self .last_used > timedelta (seconds = KERNEL_TIMEOUT )
90-
91103 def needs_health_check (self ) -> bool :
92104 return datetime .now () - self .last_health_check > timedelta (seconds = KERNEL_HEALTH_CHECK_INTERVAL )
93105
@@ -184,13 +196,15 @@ async def release_kernel(self, kernel_id: str, failed: bool = False):
184196 async def _get_existing_kernel (self ) -> Optional [str ]:
185197 """Try to get kernel ID from existing file"""
186198 try :
187- if os .path .exists (KERNEL_ID_FILE_PATH ):
188- with open (KERNEL_ID_FILE_PATH , 'r' ) as file :
189- kernel_id = file .read ().strip ()
190- if kernel_id and await self ._check_kernel_health (kernel_id ):
191- return kernel_id
199+ async with aiofiles .open (KERNEL_ID_FILE_PATH , mode = 'r' ) as f :
200+ kernel_id = (await f .read ()).strip ()
201+ if kernel_id and await self ._check_kernel_health (kernel_id ):
202+ return kernel_id
203+ except FileNotFoundError :
204+ # This is a normal case if the server is starting for the first time.
205+ pass
192206 except Exception as e :
193- logger .warning (f"Could not read existing kernel: { e } " )
207+ logger .warning (f"Could not read or validate existing kernel from { KERNEL_ID_FILE_PATH } : { e } " )
194208 return None
195209
196210 async def _create_new_kernel (self ) -> Optional [str ]:
@@ -338,7 +352,7 @@ async def execute_with_retry(command: str, ctx: Context, max_attempts: int = MAX
338352 # Get kernel from pool
339353 kernel_id = await kernel_pool .get_available_kernel ()
340354 if not kernel_id :
341- raise Exception ("No available kernels in pool" )
355+ raise NoKernelAvailableError ("No available kernels in pool" )
342356
343357 try :
344358 result = await _execute_on_kernel (kernel_id , command , ctx )
@@ -431,7 +445,7 @@ async def _execute_on_kernel(kernel_id: str, command: str, ctx: Context) -> str:
431445 elif msg_type == "error" :
432446 error_traceback = "\n " .join (content .get ("traceback" , []))
433447 logger .error (f"Execution error on kernel { kernel_id } for msg_id { sent_msg_id } :\n { error_traceback } " )
434- raise Exception (f"Execution Error:\n { error_traceback } " )
448+ raise KernelExecutionError (f"Execution Error:\n { error_traceback } " )
435449
436450 elif msg_type == "status" and content .get ("execution_state" ) == "idle" :
437451 execution_complete = True
@@ -441,18 +455,18 @@ async def _execute_on_kernel(kernel_id: str, command: str, ctx: Context) -> str:
441455 elapsed = time .time () - start_time
442456 timeout_msg = f"Execution timed out after { elapsed :.0f} seconds on kernel { kernel_id } "
443457 logger .error (f"Execution timed out for msg_id: { sent_msg_id } " )
444- raise Exception (timeout_msg )
458+ raise KernelTimeoutError (timeout_msg )
445459
446460 return "" .join (final_output_lines ) if final_output_lines else "[Execution successful with no output]"
447461
448462 except websockets .exceptions .ConnectionClosed as e :
449463 error_msg = f"WebSocket connection to kernel { kernel_id } closed unexpectedly: { e } "
450464 logger .error (error_msg )
451- raise Exception (error_msg )
465+ raise KernelError (error_msg )
452466 except websockets .exceptions .WebSocketException as e :
453467 error_msg = f"WebSocket error with kernel { kernel_id } : { e } "
454468 logger .error (error_msg )
455- raise Exception (error_msg )
469+ raise KernelError (error_msg )
456470 except Exception as e :
457471 logger .error (f"Unexpected error during execution on kernel { kernel_id } : { e } " , exc_info = True )
458472 raise e
0 commit comments