From 3ee6e90bdd9f6ab210d3ae34b399bd8d25acbd1f Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 23 Dec 2025 03:31:07 -0500 Subject: [PATCH 01/11] divertsity --- codeflash/code_utils/config_consts.py | 28 ++++++++++++++++++++ codeflash/models/models.py | 1 + codeflash/optimization/function_optimizer.py | 27 +++++++++++-------- 3 files changed, 45 insertions(+), 11 deletions(-) diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py index 88758455e..aa31d8063 100644 --- a/codeflash/code_utils/config_consts.py +++ b/codeflash/code_utils/config_consts.py @@ -32,6 +32,32 @@ MAX_N_CANDIDATES = 5 MAX_N_CANDIDATES_LP = 6 +# Multi-model diversity configuration +# Each tuple is (model_name, num_calls) where each call returns 1 candidate +# Standard mode: 3 GPT-4.1 + 2 Claude Sonnet = 5 candidates +MODEL_DISTRIBUTION: list[tuple[str, int]] = [ + ("gpt-4.1", 3), + ("claude-sonnet-4-5", 2), +] + +# LSP mode: fewer candidates for faster response +MODEL_DISTRIBUTION_LSP: list[tuple[str, int]] = [ + ("gpt-4.1", 2), + ("claude-sonnet-4-5", 1), +] + +# Line profiler mode: 6 candidates total +MODEL_DISTRIBUTION_LP: list[tuple[str, int]] = [ + ("gpt-4.1", 4), + ("claude-sonnet-4-5", 2), +] + +# Line profiler LSP mode +MODEL_DISTRIBUTION_LP_LSP: list[tuple[str, int]] = [ + ("gpt-4.1", 2), + ("claude-sonnet-4-5", 1), +] + try: from codeflash.lsp.helpers import is_LSP_enabled @@ -43,5 +69,7 @@ N_CANDIDATES_LP_EFFECTIVE = min(N_CANDIDATES_LP_LSP if _IS_LSP_ENABLED else N_CANDIDATES_LP, MAX_N_CANDIDATES_LP) N_TESTS_TO_GENERATE_EFFECTIVE = N_TESTS_TO_GENERATE_LSP if _IS_LSP_ENABLED else N_TESTS_TO_GENERATE TOTAL_LOOPING_TIME_EFFECTIVE = TOTAL_LOOPING_TIME_LSP if _IS_LSP_ENABLED else TOTAL_LOOPING_TIME +MODEL_DISTRIBUTION_EFFECTIVE = MODEL_DISTRIBUTION_LSP if _IS_LSP_ENABLED else MODEL_DISTRIBUTION +MODEL_DISTRIBUTION_LP_EFFECTIVE = MODEL_DISTRIBUTION_LP_LSP if _IS_LSP_ENABLED else MODEL_DISTRIBUTION_LP MAX_CONTEXT_LEN_REVIEW = 1000 diff --git a/codeflash/models/models.py b/codeflash/models/models.py index 1db09bc12..4f7553818 100644 --- a/codeflash/models/models.py +++ b/codeflash/models/models.py @@ -464,6 +464,7 @@ class OptimizedCandidate: optimization_id: str source: OptimizedCandidateSource parent_id: str | None = None + model: str | None = None # Which LLM model generated this candidate @dataclass(frozen=True) diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 416bdc8df..8776d9c58 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -46,6 +46,8 @@ COVERAGE_THRESHOLD, INDIVIDUAL_TESTCASE_TIMEOUT, MAX_REPAIRS_PER_TRACE, + MODEL_DISTRIBUTION_EFFECTIVE, + MODEL_DISTRIBUTION_LP_EFFECTIVE, N_CANDIDATES_EFFECTIVE, N_CANDIDATES_LP_EFFECTIVE, N_TESTS_TO_GENERATE_EFFECTIVE, @@ -921,18 +923,20 @@ def determine_best_candidate( ai_service_client = self.aiservice_client if exp_type == "EXP0" else self.local_aiservice_client assert ai_service_client is not None, "AI service client must be set for optimization" + # Use multi-model approach for line profiler optimization future_line_profile_results = self.executor.submit( - ai_service_client.optimize_python_code_line_profiler, + ai_service_client.optimize_python_code_line_profiler_multi_model, source_code=code_context.read_writable_code.markdown, dependency_code=code_context.read_only_context_code, - trace_id=self.get_trace_id(exp_type), + base_trace_id=self.get_trace_id(exp_type), 
line_profiler_results=original_code_baseline.line_profile_results["str_out"], - num_candidates=N_CANDIDATES_LP_EFFECTIVE, + model_distribution=MODEL_DISTRIBUTION_LP_EFFECTIVE, experiment_metadata=ExperimentMetadata( id=self.experiment_id, group="control" if exp_type == "EXP0" else "experiment" ) if self.experiment_id else None, + executor=self.executor, ) processor = CandidateProcessor( @@ -1353,17 +1357,17 @@ def generate_optimizations( read_only_context_code: str, run_experiment: bool = False, # noqa: FBT001, FBT002 ) -> Result[tuple[OptimizationSet, str], str]: - """Generate optimization candidates for the function.""" - n_candidates = N_CANDIDATES_EFFECTIVE - + """Generate optimization candidates for the function using multiple models in parallel.""" + # Use multi-model approach for diversity future_optimization_candidates = self.executor.submit( - self.aiservice_client.optimize_python_code, + self.aiservice_client.optimize_python_code_multi_model, read_writable_code.markdown, read_only_context_code, self.function_trace_id[:-4] + "EXP0" if run_experiment else self.function_trace_id, - n_candidates, + MODEL_DISTRIBUTION_EFFECTIVE, ExperimentMetadata(id=self.experiment_id, group="control") if run_experiment else None, is_async=self.function_to_optimize.is_async, + executor=self.executor, ) future_references = self.executor.submit( @@ -1380,13 +1384,14 @@ def generate_optimizations( if run_experiment: future_candidates_exp = self.executor.submit( - self.local_aiservice_client.optimize_python_code, + self.local_aiservice_client.optimize_python_code_multi_model, read_writable_code.markdown, read_only_context_code, self.function_trace_id[:-4] + "EXP1", - n_candidates, + MODEL_DISTRIBUTION_EFFECTIVE, ExperimentMetadata(id=self.experiment_id, group="experiment"), is_async=self.function_to_optimize.is_async, + executor=self.executor, ) futures.append(future_candidates_exp) @@ -1395,7 +1400,7 @@ def generate_optimizations( # Retrieve results candidates: list[OptimizedCandidate] = future_optimization_candidates.result() - logger.info(f"!lsp|Generated '{len(candidates)}' candidate optimizations.") + logger.info(f"!lsp|Generated '{len(candidates)}' candidate optimizations from multiple models.") if not candidates: return Failure(f"/!\\ NO OPTIMIZATIONS GENERATED for {self.function_to_optimize.function_name}") From 35ae79e8e6f778440bae4bdae5f87c863cdf6acd Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 23 Dec 2025 03:42:32 -0500 Subject: [PATCH 02/11] add diversity --- codeflash/api/aiservice.py | 110 ++++++++++++++++++- codeflash/code_utils/config_consts.py | 20 +--- codeflash/optimization/function_optimizer.py | 5 - 3 files changed, 108 insertions(+), 27 deletions(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index 86fb125b7..78d042791 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -1,5 +1,6 @@ from __future__ import annotations +import concurrent.futures import json import os import platform @@ -12,7 +13,6 @@ from codeflash.cli_cmds.console import console, logger from codeflash.code_utils.code_replacer import is_zero_diff from codeflash.code_utils.code_utils import unified_diff_strings -from codeflash.code_utils.config_consts import N_CANDIDATES_EFFECTIVE, N_CANDIDATES_LP_EFFECTIVE from codeflash.code_utils.env_utils import get_codeflash_api_key from codeflash.code_utils.git_utils import get_last_commit_author_if_pr_exists, get_repo_owner_and_name from codeflash.code_utils.time_utils import humanize_runtime @@ -35,6 +35,8 @@ from 
codeflash.models.models import AIServiceCodeRepairRequest, AIServiceRefinerRequest from codeflash.result.explanation import Explanation +multi_model_executor = concurrent.futures.ThreadPoolExecutor(max_workers=10, thread_name_prefix="multi_model") + class AiServiceClient: def __init__(self) -> None: @@ -92,7 +94,7 @@ def make_ai_service_request( return response def _get_valid_candidates( - self, optimizations_json: list[dict[str, Any]], source: OptimizedCandidateSource + self, optimizations_json: list[dict[str, Any]], source: OptimizedCandidateSource, model: str | None = None ) -> list[OptimizedCandidate]: candidates: list[OptimizedCandidate] = [] for opt in optimizations_json: @@ -106,6 +108,7 @@ def _get_valid_candidates( optimization_id=opt["optimization_id"], source=source, parent_id=opt.get("parent_id", None), + model=model, ) ) return candidates @@ -119,6 +122,7 @@ def optimize_python_code( # noqa: D417 experiment_metadata: ExperimentMetadata | None = None, *, is_async: bool = False, + model: str | None = None, ) -> list[OptimizedCandidate]: """Optimize the given python code for performance by making a request to the Django endpoint. @@ -129,6 +133,7 @@ def optimize_python_code( # noqa: D417 - trace_id (str): Trace id of optimization run - num_candidates (int): Number of optimization variants to generate. Default is 10. - experiment_metadata (Optional[ExperimentalMetadata, None]): Any available experiment metadata for this optimization + - model (str | None): Model name to use ("gpt-4.1" or "claude-sonnet-4-5"). Default is None (server default). Returns ------- @@ -149,8 +154,9 @@ def optimize_python_code( # noqa: D417 "current_username": get_last_commit_author_if_pr_exists(None), "repo_owner": git_repo_owner, "repo_name": git_repo_name, - "n_candidates": N_CANDIDATES_EFFECTIVE, + "n_candidates": num_candidates, "is_async": is_async, + "model": model, } logger.info("!lsp|Generating optimized candidates…") @@ -167,7 +173,7 @@ def optimize_python_code( # noqa: D417 console.rule() end_time = time.perf_counter() logger.debug(f"!lsp|Generating possible optimizations took {end_time - start_time:.2f} seconds.") - return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE) + return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE, model=model) try: error = response.json()["error"] except Exception: @@ -185,6 +191,7 @@ def optimize_python_code_line_profiler( # noqa: D417 line_profiler_results: str, num_candidates: int = 10, experiment_metadata: ExperimentMetadata | None = None, + model: str | None = None, ) -> list[OptimizedCandidate]: """Optimize the given python code for performance by making a request to the Django endpoint. @@ -195,6 +202,7 @@ def optimize_python_code_line_profiler( # noqa: D417 - trace_id (str): Trace id of optimization run - num_candidates (int): Number of optimization variants to generate. Default is 10. - experiment_metadata (Optional[ExperimentalMetadata, None]): Any available experiment metadata for this optimization + - model (str | None): Model name to use ("gpt-4.1" or "claude-sonnet-4-5"). Default is None (server default). 
Returns ------- @@ -211,7 +219,8 @@ def optimize_python_code_line_profiler( # noqa: D417 "experiment_metadata": experiment_metadata, "codeflash_version": codeflash_version, "lsp_mode": is_LSP_enabled(), - "n_candidates_lp": N_CANDIDATES_LP_EFFECTIVE, + "n_candidates_lp": num_candidates, + "model": model, } console.rule() @@ -232,7 +241,7 @@ def optimize_python_code_line_profiler( # noqa: D417 f"!lsp|Generated {len(optimizations_json)} candidate optimizations using line profiler information." ) console.rule() - return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE_LP) + return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE_LP, model=model) try: error = response.json()["error"] except Exception: @@ -242,6 +251,95 @@ def optimize_python_code_line_profiler( # noqa: D417 console.rule() return [] + def optimize_python_code_multi_model( + self, + source_code: str, + dependency_code: str, + base_trace_id: str, + model_distribution: list[tuple[str, int]], + experiment_metadata: ExperimentMetadata | None = None, + *, + is_async: bool = False, + ) -> list[OptimizedCandidate]: + """Generate optimizations using multiple models in parallel.""" + futures: list[tuple[concurrent.futures.Future[list[OptimizedCandidate]], str]] = [] + call_index = 0 + + for model_name, num_calls in model_distribution: + for _ in range(num_calls): + call_trace_id = f"{base_trace_id[:-4]}M{call_index:02d}" + call_index += 1 + + future = multi_model_executor.submit( + self.optimize_python_code, + source_code, + dependency_code, + call_trace_id, + num_candidates=1, # Each call returns 1 candidate + experiment_metadata=experiment_metadata, + is_async=is_async, + model=model_name, + ) + futures.append((future, model_name)) + + # Wait for all calls to complete + concurrent.futures.wait([f for f, _ in futures]) + + # Collect results + all_candidates: list[OptimizedCandidate] = [] + for future, model_name in futures: + try: + candidates = future.result() + all_candidates.extend(candidates) + except Exception as e: + logger.warning(f"Model {model_name} call failed: {e}") + continue + + return all_candidates + + def optimize_python_code_line_profiler_multi_model( + self, + source_code: str, + dependency_code: str, + base_trace_id: str, + line_profiler_results: str, + model_distribution: list[tuple[str, int]], + experiment_metadata: ExperimentMetadata | None = None, + ) -> list[OptimizedCandidate]: + """Generate line profiler optimizations using multiple models in parallel.""" + futures: list[tuple[concurrent.futures.Future[list[OptimizedCandidate]], str]] = [] + call_index = 0 + + for model_name, num_calls in model_distribution: + for _ in range(num_calls): + call_trace_id = f"{base_trace_id[:-4]}L{call_index:02d}" + call_index += 1 + + future = multi_model_executor.submit( + self.optimize_python_code_line_profiler, + source_code, + dependency_code, + call_trace_id, + line_profiler_results, + num_candidates=1, + experiment_metadata=experiment_metadata, + model=model_name, + ) + futures.append((future, model_name)) + + concurrent.futures.wait([f for f, _ in futures]) + + all_candidates: list[OptimizedCandidate] = [] + for future, model_name in futures: + try: + candidates = future.result() + all_candidates.extend(candidates) + except Exception as e: + logger.warning(f"Line profiler model {model_name} call failed: {e}") + continue + + return all_candidates + def optimize_python_code_refinement(self, request: list[AIServiceRefinerRequest]) -> list[OptimizedCandidate]: 
"""Optimize the given python code for performance by making a request to the Django endpoint. diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py index aa31d8063..ba09989f8 100644 --- a/codeflash/code_utils/config_consts.py +++ b/codeflash/code_utils/config_consts.py @@ -35,28 +35,16 @@ # Multi-model diversity configuration # Each tuple is (model_name, num_calls) where each call returns 1 candidate # Standard mode: 3 GPT-4.1 + 2 Claude Sonnet = 5 candidates -MODEL_DISTRIBUTION: list[tuple[str, int]] = [ - ("gpt-4.1", 3), - ("claude-sonnet-4-5", 2), -] +MODEL_DISTRIBUTION: list[tuple[str, int]] = [("gpt-4.1", 3), ("claude-sonnet-4-5", 2)] # LSP mode: fewer candidates for faster response -MODEL_DISTRIBUTION_LSP: list[tuple[str, int]] = [ - ("gpt-4.1", 2), - ("claude-sonnet-4-5", 1), -] +MODEL_DISTRIBUTION_LSP: list[tuple[str, int]] = [("gpt-4.1", 2), ("claude-sonnet-4-5", 1)] # Line profiler mode: 6 candidates total -MODEL_DISTRIBUTION_LP: list[tuple[str, int]] = [ - ("gpt-4.1", 4), - ("claude-sonnet-4-5", 2), -] +MODEL_DISTRIBUTION_LP: list[tuple[str, int]] = [("gpt-4.1", 4), ("claude-sonnet-4-5", 2)] # Line profiler LSP mode -MODEL_DISTRIBUTION_LP_LSP: list[tuple[str, int]] = [ - ("gpt-4.1", 2), - ("claude-sonnet-4-5", 1), -] +MODEL_DISTRIBUTION_LP_LSP: list[tuple[str, int]] = [("gpt-4.1", 2), ("claude-sonnet-4-5", 1)] try: from codeflash.lsp.helpers import is_LSP_enabled diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 8776d9c58..afd56519e 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -48,8 +48,6 @@ MAX_REPAIRS_PER_TRACE, MODEL_DISTRIBUTION_EFFECTIVE, MODEL_DISTRIBUTION_LP_EFFECTIVE, - N_CANDIDATES_EFFECTIVE, - N_CANDIDATES_LP_EFFECTIVE, N_TESTS_TO_GENERATE_EFFECTIVE, REFINE_ALL_THRESHOLD, REFINED_CANDIDATE_RANKING_WEIGHTS, @@ -936,7 +934,6 @@ def determine_best_candidate( ) if self.experiment_id else None, - executor=self.executor, ) processor = CandidateProcessor( @@ -1367,7 +1364,6 @@ def generate_optimizations( MODEL_DISTRIBUTION_EFFECTIVE, ExperimentMetadata(id=self.experiment_id, group="control") if run_experiment else None, is_async=self.function_to_optimize.is_async, - executor=self.executor, ) future_references = self.executor.submit( @@ -1391,7 +1387,6 @@ def generate_optimizations( MODEL_DISTRIBUTION_EFFECTIVE, ExperimentMetadata(id=self.experiment_id, group="experiment"), is_async=self.function_to_optimize.is_async, - executor=self.executor, ) futures.append(future_candidates_exp) From cdf85d2c8be74d37a2352b33906b564cfd7fc123 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 23 Dec 2025 03:56:02 -0500 Subject: [PATCH 03/11] implement trace_id observability --- codeflash/api/aiservice.py | 77 +++++++++++--------- codeflash/models/models.py | 1 + codeflash/optimization/function_optimizer.py | 72 +++++++++++++++--- codeflash/verification/verifier.py | 2 + 4 files changed, 105 insertions(+), 47 deletions(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index 78d042791..4dca8096c 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -118,11 +118,11 @@ def optimize_python_code( # noqa: D417 source_code: str, dependency_code: str, trace_id: str, - num_candidates: int = 10, experiment_metadata: ExperimentMetadata | None = None, *, is_async: bool = False, model: str | None = None, + call_sequence: int | None = None, ) -> list[OptimizedCandidate]: """Optimize the given 
python code for performance by making a request to the Django endpoint. @@ -131,9 +131,9 @@ def optimize_python_code( # noqa: D417 - source_code (str): The python code to optimize. - dependency_code (str): The dependency code used as read-only context for the optimization - trace_id (str): Trace id of optimization run - - num_candidates (int): Number of optimization variants to generate. Default is 10. - experiment_metadata (Optional[ExperimentalMetadata, None]): Any available experiment metadata for this optimization - model (str | None): Model name to use ("gpt-4.1" or "claude-sonnet-4-5"). Default is None (server default). + - call_sequence (int | None): Sequence number for multi-model calls (1, 2, 3...). Default is None. Returns ------- @@ -146,7 +146,6 @@ def optimize_python_code( # noqa: D417 payload = { "source_code": source_code, "dependency_code": dependency_code, - "num_variants": num_candidates, "trace_id": trace_id, "python_version": platform.python_version(), "experiment_metadata": experiment_metadata, @@ -154,13 +153,12 @@ def optimize_python_code( # noqa: D417 "current_username": get_last_commit_author_if_pr_exists(None), "repo_owner": git_repo_owner, "repo_name": git_repo_name, - "n_candidates": num_candidates, "is_async": is_async, "model": model, + "call_sequence": call_sequence, } + logger.debug(f"Sending optimize request: model={model}, trace_id={trace_id}, call_sequence={call_sequence}") - logger.info("!lsp|Generating optimized candidates…") - console.rule() try: response = self.make_ai_service_request("/optimize", payload=payload, timeout=60) except requests.exceptions.RequestException as e: @@ -170,9 +168,9 @@ def optimize_python_code( # noqa: D417 if response.status_code == 200: optimizations_json = response.json()["optimizations"] - console.rule() end_time = time.perf_counter() logger.debug(f"!lsp|Generating possible optimizations took {end_time - start_time:.2f} seconds.") + logger.debug(f"Backend returned {len(optimizations_json)} optimization(s)") return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE, model=model) try: error = response.json()["error"] @@ -180,7 +178,6 @@ def optimize_python_code( # noqa: D417 error = response.text logger.error(f"Error generating optimized candidates: {response.status_code} - {error}") ph("cli-optimize-error-response", {"response_status_code": response.status_code, "error": error}) - console.rule() return [] def optimize_python_code_line_profiler( # noqa: D417 @@ -189,9 +186,9 @@ def optimize_python_code_line_profiler( # noqa: D417 dependency_code: str, trace_id: str, line_profiler_results: str, - num_candidates: int = 10, experiment_metadata: ExperimentMetadata | None = None, model: str | None = None, + call_sequence: int | None = None, ) -> list[OptimizedCandidate]: """Optimize the given python code for performance by making a request to the Django endpoint. @@ -200,9 +197,9 @@ def optimize_python_code_line_profiler( # noqa: D417 - source_code (str): The python code to optimize. - dependency_code (str): The dependency code used as read-only context for the optimization - trace_id (str): Trace id of optimization run - - num_candidates (int): Number of optimization variants to generate. Default is 10. - experiment_metadata (Optional[ExperimentalMetadata, None]): Any available experiment metadata for this optimization - model (str | None): Model name to use ("gpt-4.1" or "claude-sonnet-4-5"). Default is None (server default). 
+ - call_sequence (int | None): Sequence number for multi-model calls (1, 2, 3...). Default is None. Returns ------- @@ -212,21 +209,18 @@ def optimize_python_code_line_profiler( # noqa: D417 payload = { "source_code": source_code, "dependency_code": dependency_code, - "num_variants": num_candidates, "line_profiler_results": line_profiler_results, "trace_id": trace_id, "python_version": platform.python_version(), "experiment_metadata": experiment_metadata, "codeflash_version": codeflash_version, "lsp_mode": is_LSP_enabled(), - "n_candidates_lp": num_candidates, "model": model, + "call_sequence": call_sequence, } - console.rule() if line_profiler_results == "": logger.info("No LineProfiler results were provided, Skipping optimization.") - console.rule() return [] try: response = self.make_ai_service_request("/optimize-line-profiler", payload=payload, timeout=60) @@ -237,10 +231,7 @@ def optimize_python_code_line_profiler( # noqa: D417 if response.status_code == 200: optimizations_json = response.json()["optimizations"] - logger.info( - f"!lsp|Generated {len(optimizations_json)} candidate optimizations using line profiler information." - ) - console.rule() + logger.debug(f"Backend returned {len(optimizations_json)} LP optimization(s)") return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE_LP, model=model) try: error = response.json()["error"] @@ -248,7 +239,6 @@ def optimize_python_code_line_profiler( # noqa: D417 error = response.text logger.error(f"Error generating optimized candidates: {response.status_code} - {error}") ph("cli-optimize-error-response", {"response_status_code": response.status_code, "error": error}) - console.rule() return [] def optimize_python_code_multi_model( @@ -260,32 +250,34 @@ def optimize_python_code_multi_model( experiment_metadata: ExperimentMetadata | None = None, *, is_async: bool = False, - ) -> list[OptimizedCandidate]: + sequence_offset: int = 0, + ) -> tuple[list[OptimizedCandidate], int]: """Generate optimizations using multiple models in parallel.""" + logger.info("Generating optimized candidates…") + console.rule() + futures: list[tuple[concurrent.futures.Future[list[OptimizedCandidate]], str]] = [] - call_index = 0 + call_index = 0 for model_name, num_calls in model_distribution: for _ in range(num_calls): - call_trace_id = f"{base_trace_id[:-4]}M{call_index:02d}" + call_trace_id = f"{base_trace_id[:-3]}0{call_index:02x}" + call_sequence = sequence_offset + call_index + 1 call_index += 1 - future = multi_model_executor.submit( self.optimize_python_code, source_code, dependency_code, call_trace_id, - num_candidates=1, # Each call returns 1 candidate - experiment_metadata=experiment_metadata, + experiment_metadata, is_async=is_async, model=model_name, + call_sequence=call_sequence, ) futures.append((future, model_name)) - # Wait for all calls to complete concurrent.futures.wait([f for f, _ in futures]) - # Collect results all_candidates: list[OptimizedCandidate] = [] for future, model_name in futures: try: @@ -295,7 +287,8 @@ def optimize_python_code_multi_model( logger.warning(f"Model {model_name} call failed: {e}") continue - return all_candidates + console.rule() + return all_candidates, call_index def optimize_python_code_line_profiler_multi_model( self, @@ -305,25 +298,29 @@ def optimize_python_code_line_profiler_multi_model( line_profiler_results: str, model_distribution: list[tuple[str, int]], experiment_metadata: ExperimentMetadata | None = None, - ) -> list[OptimizedCandidate]: + sequence_offset: int = 0, + ) 
-> tuple[list[OptimizedCandidate], int]: """Generate line profiler optimizations using multiple models in parallel.""" + logger.info("Generating optimized candidates with line profiler…") + console.rule() + futures: list[tuple[concurrent.futures.Future[list[OptimizedCandidate]], str]] = [] - call_index = 0 + call_index = 0 for model_name, num_calls in model_distribution: for _ in range(num_calls): - call_trace_id = f"{base_trace_id[:-4]}L{call_index:02d}" + call_trace_id = f"{base_trace_id[:-3]}1{call_index:02x}" + call_sequence = sequence_offset + call_index + 1 call_index += 1 - future = multi_model_executor.submit( self.optimize_python_code_line_profiler, source_code, dependency_code, call_trace_id, line_profiler_results, - num_candidates=1, - experiment_metadata=experiment_metadata, - model=model_name, + experiment_metadata, + model_name, + call_sequence, ) futures.append((future, model_name)) @@ -338,7 +335,8 @@ def optimize_python_code_line_profiler_multi_model( logger.warning(f"Line profiler model {model_name} call failed: {e}") continue - return all_candidates + console.rule() + return all_candidates, call_index def optimize_python_code_refinement(self, request: list[AIServiceRefinerRequest]) -> list[OptimizedCandidate]: """Optimize the given python code for performance by making a request to the Django endpoint. @@ -366,6 +364,7 @@ def optimize_python_code_refinement(self, request: list[AIServiceRefinerRequest] "trace_id": opt.trace_id, "function_references": opt.function_references, "python_version": platform.python_version(), + "call_sequence": opt.call_sequence, } for opt in request ] @@ -455,6 +454,7 @@ def get_new_explanation( # noqa: D417 throughput_improvement: str | None = None, function_references: str | None = None, codeflash_version: str = codeflash_version, + call_sequence: int | None = None, ) -> str: """Optimize the given python code for performance by making a request to the Django endpoint. @@ -500,6 +500,7 @@ def get_new_explanation( # noqa: D417 "throughput_improvement": throughput_improvement, "function_references": function_references, "codeflash_version": codeflash_version, + "call_sequence": call_sequence, } logger.info("loading|Generating explanation") console.rule() @@ -627,6 +628,7 @@ def generate_regression_tests( # noqa: D417 test_timeout: int, trace_id: str, test_index: int, + call_sequence: int | None = None, ) -> tuple[str, str, str] | None: """Generate regression tests for the given function by making a request to the Django endpoint. @@ -662,6 +664,7 @@ def generate_regression_tests( # noqa: D417 "python_version": platform.python_version(), "codeflash_version": codeflash_version, "is_async": function_to_optimize.is_async, + "call_sequence": call_sequence, } try: response = self.make_ai_service_request("/testgen", payload=payload, timeout=90) @@ -702,6 +705,7 @@ def get_optimization_review( replay_tests: str, concolic_tests: str, # noqa: ARG002 calling_fn_details: str, + call_sequence: int | None = None, ) -> str: """Compute the optimization review of current Pull Request. 
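For context on the per-call trace IDs built above: the last three characters of the base trace ID are overwritten with what reads here as a phase marker ("0" for the /optimize calls, "1" for the line-profiler calls) plus a two-hex-digit call index, so each parallel backend call stays individually traceable. A minimal sketch of that scheme as read from the patch; the base ID below is hypothetical:

    # Illustration only -- mirrors the f-strings in the multi-model helpers above.
    def per_call_trace_id(base_trace_id: str, phase: str, call_index: int) -> str:
        # phase "0": standard optimize calls, phase "1": line-profiler calls
        return f"{base_trace_id[:-3]}{phase}{call_index:02x}"

    base = "1f2e3d4c5b6a7980aabbccddeeff0011"  # hypothetical base trace id
    print(per_call_trace_id(base, "1", 3))     # -> "1f2e3d4c5b6a7980aabbccddeeff0103"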
@@ -748,6 +752,7 @@ def get_optimization_review( "codeflash_version": codeflash_version, "calling_fn_details": calling_fn_details, "python_version": platform.python_version(), + "call_sequence": call_sequence, } console.rule() try: diff --git a/codeflash/models/models.py b/codeflash/models/models.py index 4f7553818..822ecffab 100644 --- a/codeflash/models/models.py +++ b/codeflash/models/models.py @@ -46,6 +46,7 @@ class AIServiceRefinerRequest: original_line_profiler_results: str optimized_line_profiler_results: str function_references: str | None = None + call_sequence: int | None = None class TestDiffScope(str, Enum): diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index afd56519e..e8e51deb7 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -139,6 +139,7 @@ def __init__( ai_service_client: AiServiceClient, executor: concurrent.futures.ThreadPoolExecutor, future_all_code_repair: list[concurrent.futures.Future], + sequence_offset: int = 0, ) -> None: self.candidate_queue = queue.Queue() self.line_profiler_done = False @@ -146,6 +147,9 @@ def __init__( self.candidate_len = len(initial_candidates) self.ai_service_client = ai_service_client self.executor = executor + self.sequence_offset = sequence_offset + self.lp_calls_count = 0 + self.refinement_calls_count = 0 # Initialize queue with initial candidates for candidate in initial_candidates: @@ -155,6 +159,9 @@ def __init__( self.all_refinements_data = all_refinements_data self.future_all_code_repair = future_all_code_repair + def get_total_llm_calls(self) -> int: + return self.sequence_offset + self.lp_calls_count + self.refinement_calls_count + def get_next_candidate(self) -> OptimizedCandidate | None: """Get the next candidate from the queue, handling async results as needed.""" try: @@ -176,7 +183,11 @@ def _process_line_profiler_results(self) -> OptimizedCandidate | None: """Process line profiler results and add to queue.""" logger.debug("all candidates processed, await candidates from line profiler") concurrent.futures.wait([self.future_line_profile_results]) - line_profile_results = self.future_line_profile_results.result() + result = self.future_line_profile_results.result() + + # LP multi-model now returns (candidates, lp_call_count) + line_profile_results, lp_call_count = result + self.lp_calls_count = lp_call_count for candidate in line_profile_results: self.candidate_queue.put(candidate) @@ -192,11 +203,18 @@ def refine_optimizations(self, request: list[AIServiceRefinerRequest]) -> concur def _process_refinement_results(self) -> OptimizedCandidate | None: """Process refinement results and add to queue. 
We generate a weighted ranking based on the runtime and diff lines and select the best (round of 45%) of valid optimizations to be refined.""" + import dataclasses # noqa: PLC0415 + future_refinements: list[concurrent.futures.Future] = [] + # Calculate base sequence: offset + lp_calls (refinements come after LP) + base_sequence = self.sequence_offset + self.lp_calls_count + refinement_call_index = 0 if len(self.all_refinements_data) <= REFINE_ALL_THRESHOLD: for data in self.all_refinements_data: - future_refinements.append(self.refine_optimizations([data])) # noqa: PERF401 + refinement_call_index += 1 + data_with_seq = dataclasses.replace(data, call_sequence=base_sequence + refinement_call_index) + future_refinements.append(self.refine_optimizations([data_with_seq])) else: diff_lens_list = [] runtimes_list = [] @@ -215,8 +233,13 @@ def _process_refinement_results(self) -> OptimizedCandidate | None: top_indecies = sorted(score_dict, key=score_dict.get)[:top_n_candidates] for idx in top_indecies: + refinement_call_index += 1 data = self.all_refinements_data[idx] - future_refinements.append(self.refine_optimizations([data])) + data_with_seq = dataclasses.replace(data, call_sequence=base_sequence + refinement_call_index) + future_refinements.append(self.refine_optimizations([data_with_seq])) + + # Track total refinement calls made + self.refinement_calls_count = refinement_call_index if future_refinements: logger.info("loading|Refining generated code for improved quality and performance...") @@ -319,10 +342,14 @@ def __init__( self.optimization_review = "" self.future_all_code_repair: list[concurrent.futures.Future] = [] self.repair_counter = 0 # track how many repairs we did for each function + self.test_gen_calls_count = 0 + self.optimize_calls_count = 0 + self.lp_calls_count = 0 + self.total_llm_calls = 0 def can_be_optimized(self) -> Result[tuple[bool, CodeOptimizationContext, dict[Path, str]], str]: should_run_experiment = self.experiment_id is not None - logger.debug(f"Function Trace ID: {self.function_trace_id}") + logger.info(f"Function Trace ID: {self.function_trace_id}") ph("cli-optimize-function-start", {"function_trace_id": self.function_trace_id}) self.cleanup_leftover_test_return_values() file_name_from_test_module_name.cache_clear() @@ -921,7 +948,6 @@ def determine_best_candidate( ai_service_client = self.aiservice_client if exp_type == "EXP0" else self.local_aiservice_client assert ai_service_client is not None, "AI service client must be set for optimization" - # Use multi-model approach for line profiler optimization future_line_profile_results = self.executor.submit( ai_service_client.optimize_python_code_line_profiler_multi_model, source_code=code_context.read_writable_code.markdown, @@ -934,6 +960,7 @@ def determine_best_candidate( ) if self.experiment_id else None, + sequence_offset=self.optimize_calls_count, ) processor = CandidateProcessor( @@ -943,6 +970,7 @@ def determine_best_candidate( self.aiservice_client, self.executor, self.future_all_code_repair, + sequence_offset=self.optimize_calls_count, ) candidate_index = 0 @@ -976,6 +1004,9 @@ def determine_best_candidate( self.function_to_optimize_source_code, original_helper_code, self.function_to_optimize.file_path ) + # Track total LLM calls from the processor for sequence numbering + self.total_llm_calls = processor.get_total_llm_calls() + # Select and return the best optimization best_optimization = self.select_best_optimization( eval_ctx=eval_ctx, @@ -1355,7 +1386,6 @@ def generate_optimizations( run_experiment: 
bool = False, # noqa: FBT001, FBT002 ) -> Result[tuple[OptimizationSet, str], str]: """Generate optimization candidates for the function using multiple models in parallel.""" - # Use multi-model approach for diversity future_optimization_candidates = self.executor.submit( self.aiservice_client.optimize_python_code_multi_model, read_writable_code.markdown, @@ -1364,6 +1394,7 @@ def generate_optimizations( MODEL_DISTRIBUTION_EFFECTIVE, ExperimentMetadata(id=self.experiment_id, group="control") if run_experiment else None, is_async=self.function_to_optimize.is_async, + sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, ) future_references = self.executor.submit( @@ -1387,20 +1418,26 @@ def generate_optimizations( MODEL_DISTRIBUTION_EFFECTIVE, ExperimentMetadata(id=self.experiment_id, group="experiment"), is_async=self.function_to_optimize.is_async, + sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, ) futures.append(future_candidates_exp) # Wait for optimization futures to complete concurrent.futures.wait(futures) - # Retrieve results - candidates: list[OptimizedCandidate] = future_optimization_candidates.result() - logger.info(f"!lsp|Generated '{len(candidates)}' candidate optimizations from multiple models.") + # Retrieve results - optimize_python_code_multi_model returns (candidates, call_count) + candidates, optimize_call_count = future_optimization_candidates.result() + # Total sequence count = test gen calls + optimization calls (LP will continue from here) + self.optimize_calls_count = N_TESTS_TO_GENERATE_EFFECTIVE + optimize_call_count + logger.info(f"!lsp|Completed {optimize_call_count} optimization calls, got {len(candidates)} candidates.") if not candidates: return Failure(f"/!\\ NO OPTIMIZATIONS GENERATED for {self.function_to_optimize.function_name}") - candidates_experiment = future_candidates_exp.result() if future_candidates_exp else None + # Handle experiment results - also returns (candidates, call_count) tuple + candidates_experiment = None + if future_candidates_exp: + candidates_experiment, _ = future_candidates_exp.result() function_references = future_references.result() return Success((OptimizationSet(control=candidates, experiment=candidates_experiment), function_references)) @@ -1647,6 +1684,10 @@ def process_review( ) throughput_improvement_str = f"{throughput_improvement_value * 100:.1f}%" + # Explanation call continues the sequence numbering + explanation_call_sequence = self.total_llm_calls + 1 + self.total_llm_calls = explanation_call_sequence + new_explanation_raw_str = self.aiservice_client.get_new_explanation( source_code=code_context.read_writable_code.flat, dependency_code=code_context.read_only_context_code, @@ -1664,6 +1705,7 @@ def process_review( optimized_throughput=optimized_throughput_str, throughput_improvement=throughput_improvement_str, function_references=function_references, + call_sequence=explanation_call_sequence, ) new_explanation = Explanation( raw_explanation_message=new_explanation_raw_str or explanation.raw_explanation_message, @@ -1699,9 +1741,13 @@ def process_review( staging_review = self.args.staging_review opt_review_response = "" # this will now run regardless of pr, staging review flags + # Optimization review call continues the sequence numbering + review_call_sequence = self.total_llm_calls + 1 + self.total_llm_calls = review_call_sequence + try: opt_review_response = self.aiservice_client.get_optimization_review( - **data, calling_fn_details=function_references + **data, calling_fn_details=function_references, 
call_sequence=review_call_sequence ) except Exception as e: logger.debug(f"optimization review response failed, investigate {e}") @@ -2192,6 +2238,9 @@ def submit_test_generation_tasks( generated_test_paths: list[Path], generated_perf_test_paths: list[Path], ) -> list[concurrent.futures.Future]: + # Track how many test generation calls we're making for sequence numbering + self.test_gen_calls_count = len(generated_test_paths) + return [ executor.submit( generate_tests, @@ -2206,6 +2255,7 @@ def submit_test_generation_tasks( test_index, test_path, test_perf_path, + call_sequence=test_index + 1, ) for test_index, (test_path, test_perf_path) in enumerate( zip(generated_test_paths, generated_perf_test_paths) diff --git a/codeflash/verification/verifier.py b/codeflash/verification/verifier.py index 8d187f2b1..d94455df3 100644 --- a/codeflash/verification/verifier.py +++ b/codeflash/verification/verifier.py @@ -27,6 +27,7 @@ def generate_tests( test_index: int, test_path: Path, test_perf_path: Path, + call_sequence: int | None = None, ) -> tuple[str, str, Path] | None: # TODO: Sometimes this recreates the original Class definition. This overrides and messes up the original # class import. Remove the recreation of the class definition @@ -42,6 +43,7 @@ def generate_tests( test_timeout=test_timeout, trace_id=function_trace_id, test_index=test_index, + call_sequence=call_sequence, ) if response and isinstance(response, tuple) and len(response) == 3: generated_test_source, instrumented_behavior_test_source, instrumented_perf_test_source = response From 5a122a99fb44f6b7a82c95afba9c5a8c716d0b04 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 23 Dec 2025 18:34:37 -0500 Subject: [PATCH 04/11] pre-commit changes --- codeflash/api/aiservice.py | 1 + codeflash/optimization/function_optimizer.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index e233e6a71..fff3611fd 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -721,6 +721,7 @@ def get_optimization_review( root_dir: Path -> path of git directory concolic_tests: str -> concolic_tests (not used) calling_fn_details: str -> filenames and definitions of functions which call the function_to_optimize + call_sequence: int | None -> sequence number for multi-model calls Returns: ------- diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index e8e51deb7..dfafb86a5 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -203,7 +203,7 @@ def refine_optimizations(self, request: list[AIServiceRefinerRequest]) -> concur def _process_refinement_results(self) -> OptimizedCandidate | None: """Process refinement results and add to queue. 
We generate a weighted ranking based on the runtime and diff lines and select the best (round of 45%) of valid optimizations to be refined.""" - import dataclasses # noqa: PLC0415 + import dataclasses future_refinements: list[concurrent.futures.Future] = [] # Calculate base sequence: offset + lp_calls (refinements come after LP) From 1b6e046553ba97c093a13ae59c862d000e685a13 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 23 Dec 2025 18:56:10 -0500 Subject: [PATCH 05/11] have the client manage the TPE --- codeflash/api/aiservice.py | 9 ++++----- codeflash/optimization/function_optimizer.py | 3 +++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index fff3611fd..876ee74c5 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -35,9 +35,6 @@ from codeflash.models.models import AIServiceCodeRepairRequest, AIServiceRefinerRequest from codeflash.result.explanation import Explanation -multi_model_executor = concurrent.futures.ThreadPoolExecutor(max_workers=10, thread_name_prefix="multi_model") - - class AiServiceClient: def __init__(self) -> None: self.base_url = self.get_aiservice_base_url() @@ -251,6 +248,7 @@ def optimize_python_code_multi_model( *, is_async: bool = False, sequence_offset: int = 0, + executor: concurrent.futures.ThreadPoolExecutor | None = None, ) -> tuple[list[OptimizedCandidate], int]: """Generate optimizations using multiple models in parallel.""" logger.info("Generating optimized candidates…") @@ -264,7 +262,7 @@ def optimize_python_code_multi_model( call_trace_id = f"{base_trace_id[:-3]}0{call_index:02x}" call_sequence = sequence_offset + call_index + 1 call_index += 1 - future = multi_model_executor.submit( + future = executor.submit( self.optimize_python_code, source_code, dependency_code, @@ -299,6 +297,7 @@ def optimize_python_code_line_profiler_multi_model( model_distribution: list[tuple[str, int]], experiment_metadata: ExperimentMetadata | None = None, sequence_offset: int = 0, + executor: concurrent.futures.ThreadPoolExecutor | None = None, ) -> tuple[list[OptimizedCandidate], int]: """Generate line profiler optimizations using multiple models in parallel.""" logger.info("Generating optimized candidates with line profiler…") @@ -312,7 +311,7 @@ def optimize_python_code_line_profiler_multi_model( call_trace_id = f"{base_trace_id[:-3]}1{call_index:02x}" call_sequence = sequence_offset + call_index + 1 call_index += 1 - future = multi_model_executor.submit( + future = executor.submit( self.optimize_python_code_line_profiler, source_code, dependency_code, diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index dfafb86a5..b29586b96 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -961,6 +961,7 @@ def determine_best_candidate( if self.experiment_id else None, sequence_offset=self.optimize_calls_count, + executor=self.executor, ) processor = CandidateProcessor( @@ -1395,6 +1396,7 @@ def generate_optimizations( ExperimentMetadata(id=self.experiment_id, group="control") if run_experiment else None, is_async=self.function_to_optimize.is_async, sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, + executor=self.executor, ) future_references = self.executor.submit( @@ -1419,6 +1421,7 @@ def generate_optimizations( ExperimentMetadata(id=self.experiment_id, group="experiment"), is_async=self.function_to_optimize.is_async, sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, + 
executor=self.executor, ) futures.append(future_candidates_exp) From 1c6e9513faa31df7ef57adcb0383fac03819019e Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 23 Dec 2025 18:58:42 -0500 Subject: [PATCH 06/11] we should always have an executor --- codeflash/api/aiservice.py | 5 +++-- codeflash/optimization/function_optimizer.py | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index 876ee74c5..7480252bd 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -35,6 +35,7 @@ from codeflash.models.models import AIServiceCodeRepairRequest, AIServiceRefinerRequest from codeflash.result.explanation import Explanation + class AiServiceClient: def __init__(self) -> None: self.base_url = self.get_aiservice_base_url() @@ -244,11 +245,11 @@ def optimize_python_code_multi_model( dependency_code: str, base_trace_id: str, model_distribution: list[tuple[str, int]], + executor: concurrent.futures.ThreadPoolExecutor, experiment_metadata: ExperimentMetadata | None = None, *, is_async: bool = False, sequence_offset: int = 0, - executor: concurrent.futures.ThreadPoolExecutor | None = None, ) -> tuple[list[OptimizedCandidate], int]: """Generate optimizations using multiple models in parallel.""" logger.info("Generating optimized candidates…") @@ -295,9 +296,9 @@ def optimize_python_code_line_profiler_multi_model( base_trace_id: str, line_profiler_results: str, model_distribution: list[tuple[str, int]], + executor: concurrent.futures.ThreadPoolExecutor, experiment_metadata: ExperimentMetadata | None = None, sequence_offset: int = 0, - executor: concurrent.futures.ThreadPoolExecutor | None = None, ) -> tuple[list[OptimizedCandidate], int]: """Generate line profiler optimizations using multiple models in parallel.""" logger.info("Generating optimized candidates with line profiler…") diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index b29586b96..6228ee01a 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -955,13 +955,13 @@ def determine_best_candidate( base_trace_id=self.get_trace_id(exp_type), line_profiler_results=original_code_baseline.line_profile_results["str_out"], model_distribution=MODEL_DISTRIBUTION_LP_EFFECTIVE, + executor=self.executor, experiment_metadata=ExperimentMetadata( id=self.experiment_id, group="control" if exp_type == "EXP0" else "experiment" ) if self.experiment_id else None, sequence_offset=self.optimize_calls_count, - executor=self.executor, ) processor = CandidateProcessor( @@ -1393,10 +1393,10 @@ def generate_optimizations( read_only_context_code, self.function_trace_id[:-4] + "EXP0" if run_experiment else self.function_trace_id, MODEL_DISTRIBUTION_EFFECTIVE, + self.executor, ExperimentMetadata(id=self.experiment_id, group="control") if run_experiment else None, is_async=self.function_to_optimize.is_async, sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, - executor=self.executor, ) future_references = self.executor.submit( @@ -1418,10 +1418,10 @@ def generate_optimizations( read_only_context_code, self.function_trace_id[:-4] + "EXP1", MODEL_DISTRIBUTION_EFFECTIVE, + self.executor, ExperimentMetadata(id=self.experiment_id, group="experiment"), is_async=self.function_to_optimize.is_async, sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, - executor=self.executor, ) futures.append(future_candidates_exp) From bbb15e78379bf04a0cd22cff04f679d7e396584f Mon Sep 17 00:00:00 2001 
From: Kevin Turcios Date: Fri, 26 Dec 2025 11:57:30 -0500 Subject: [PATCH 07/11] do this elsewhere --- codeflash/api/aiservice.py | 153 ++++--------------- codeflash/code_utils/config_consts.py | 16 -- codeflash/optimization/function_optimizer.py | 79 +++------- 3 files changed, 50 insertions(+), 198 deletions(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index 7480252bd..e858726b1 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -1,6 +1,5 @@ from __future__ import annotations -import concurrent.futures import json import os import platform @@ -92,7 +91,7 @@ def make_ai_service_request( return response def _get_valid_candidates( - self, optimizations_json: list[dict[str, Any]], source: OptimizedCandidateSource, model: str | None = None + self, optimizations_json: list[dict[str, Any]], source: OptimizedCandidateSource ) -> list[OptimizedCandidate]: candidates: list[OptimizedCandidate] = [] for opt in optimizations_json: @@ -106,7 +105,7 @@ def _get_valid_candidates( optimization_id=opt["optimization_id"], source=source, parent_id=opt.get("parent_id", None), - model=model, + model=opt.get("model"), ) ) return candidates @@ -119,8 +118,6 @@ def optimize_python_code( # noqa: D417 experiment_metadata: ExperimentMetadata | None = None, *, is_async: bool = False, - model: str | None = None, - call_sequence: int | None = None, ) -> list[OptimizedCandidate]: """Optimize the given python code for performance by making a request to the Django endpoint. @@ -130,14 +127,15 @@ def optimize_python_code( # noqa: D417 - dependency_code (str): The dependency code used as read-only context for the optimization - trace_id (str): Trace id of optimization run - experiment_metadata (Optional[ExperimentalMetadata, None]): Any available experiment metadata for this optimization - - model (str | None): Model name to use ("gpt-4.1" or "claude-sonnet-4-5"). Default is None (server default). - - call_sequence (int | None): Sequence number for multi-model calls (1, 2, 3...). Default is None. + - is_async (bool): Whether the function being optimized is async Returns ------- - List[OptimizationCandidate]: A list of Optimization Candidates. 
""" + logger.info("Generating optimized candidates…") + console.rule() start_time = time.perf_counter() git_repo_owner, git_repo_name = safe_get_repo_owner_and_name() @@ -152,30 +150,32 @@ def optimize_python_code( # noqa: D417 "repo_owner": git_repo_owner, "repo_name": git_repo_name, "is_async": is_async, - "model": model, - "call_sequence": call_sequence, + "lsp_mode": is_LSP_enabled(), } - logger.debug(f"Sending optimize request: model={model}, trace_id={trace_id}, call_sequence={call_sequence}") + logger.debug(f"Sending optimize request: trace_id={trace_id}, lsp_mode={payload['lsp_mode']}") try: - response = self.make_ai_service_request("/optimize", payload=payload, timeout=60) + response = self.make_ai_service_request("/optimize", payload=payload, timeout=120) except requests.exceptions.RequestException as e: logger.exception(f"Error generating optimized candidates: {e}") ph("cli-optimize-error-caught", {"error": str(e)}) + console.rule() return [] if response.status_code == 200: optimizations_json = response.json()["optimizations"] end_time = time.perf_counter() logger.debug(f"!lsp|Generating possible optimizations took {end_time - start_time:.2f} seconds.") - logger.debug(f"Backend returned {len(optimizations_json)} optimization(s)") - return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE, model=model) + logger.info(f"!lsp|Received {len(optimizations_json)} optimization candidates.") + console.rule() + return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE) try: error = response.json()["error"] except Exception: error = response.text logger.error(f"Error generating optimized candidates: {response.status_code} - {error}") ph("cli-optimize-error-response", {"response_status_code": response.status_code, "error": error}) + console.rule() return [] def optimize_python_code_line_profiler( # noqa: D417 @@ -185,25 +185,29 @@ def optimize_python_code_line_profiler( # noqa: D417 trace_id: str, line_profiler_results: str, experiment_metadata: ExperimentMetadata | None = None, - model: str | None = None, - call_sequence: int | None = None, ) -> list[OptimizedCandidate]: - """Optimize the given python code for performance by making a request to the Django endpoint. + """Optimize the given python code for performance using line profiler results. Parameters ---------- - source_code (str): The python code to optimize. - dependency_code (str): The dependency code used as read-only context for the optimization - trace_id (str): Trace id of optimization run + - line_profiler_results (str): Line profiler output to guide optimization - experiment_metadata (Optional[ExperimentalMetadata, None]): Any available experiment metadata for this optimization - - model (str | None): Model name to use ("gpt-4.1" or "claude-sonnet-4-5"). Default is None (server default). - - call_sequence (int | None): Sequence number for multi-model calls (1, 2, 3...). Default is None. Returns ------- - List[OptimizationCandidate]: A list of Optimization Candidates. 
""" + if line_profiler_results == "": + logger.info("No LineProfiler results were provided, Skipping optimization.") + return [] + + logger.info("Generating optimized candidates with line profiler…") + console.rule() + payload = { "source_code": source_code, "dependency_code": dependency_code, @@ -213,130 +217,29 @@ def optimize_python_code_line_profiler( # noqa: D417 "experiment_metadata": experiment_metadata, "codeflash_version": codeflash_version, "lsp_mode": is_LSP_enabled(), - "model": model, - "call_sequence": call_sequence, } - if line_profiler_results == "": - logger.info("No LineProfiler results were provided, Skipping optimization.") - return [] try: - response = self.make_ai_service_request("/optimize-line-profiler", payload=payload, timeout=60) + response = self.make_ai_service_request("/optimize-line-profiler", payload=payload, timeout=120) except requests.exceptions.RequestException as e: logger.exception(f"Error generating optimized candidates: {e}") ph("cli-optimize-error-caught", {"error": str(e)}) + console.rule() return [] if response.status_code == 200: optimizations_json = response.json()["optimizations"] - logger.debug(f"Backend returned {len(optimizations_json)} LP optimization(s)") - return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE_LP, model=model) + logger.info(f"!lsp|Received {len(optimizations_json)} line profiler optimization candidates.") + console.rule() + return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE_LP) try: error = response.json()["error"] except Exception: error = response.text logger.error(f"Error generating optimized candidates: {response.status_code} - {error}") ph("cli-optimize-error-response", {"response_status_code": response.status_code, "error": error}) - return [] - - def optimize_python_code_multi_model( - self, - source_code: str, - dependency_code: str, - base_trace_id: str, - model_distribution: list[tuple[str, int]], - executor: concurrent.futures.ThreadPoolExecutor, - experiment_metadata: ExperimentMetadata | None = None, - *, - is_async: bool = False, - sequence_offset: int = 0, - ) -> tuple[list[OptimizedCandidate], int]: - """Generate optimizations using multiple models in parallel.""" - logger.info("Generating optimized candidates…") - console.rule() - - futures: list[tuple[concurrent.futures.Future[list[OptimizedCandidate]], str]] = [] - - call_index = 0 - for model_name, num_calls in model_distribution: - for _ in range(num_calls): - call_trace_id = f"{base_trace_id[:-3]}0{call_index:02x}" - call_sequence = sequence_offset + call_index + 1 - call_index += 1 - future = executor.submit( - self.optimize_python_code, - source_code, - dependency_code, - call_trace_id, - experiment_metadata, - is_async=is_async, - model=model_name, - call_sequence=call_sequence, - ) - futures.append((future, model_name)) - - concurrent.futures.wait([f for f, _ in futures]) - - all_candidates: list[OptimizedCandidate] = [] - for future, model_name in futures: - try: - candidates = future.result() - all_candidates.extend(candidates) - except Exception as e: - logger.warning(f"Model {model_name} call failed: {e}") - continue - console.rule() - return all_candidates, call_index - - def optimize_python_code_line_profiler_multi_model( - self, - source_code: str, - dependency_code: str, - base_trace_id: str, - line_profiler_results: str, - model_distribution: list[tuple[str, int]], - executor: concurrent.futures.ThreadPoolExecutor, - experiment_metadata: ExperimentMetadata | None = 
None, - sequence_offset: int = 0, - ) -> tuple[list[OptimizedCandidate], int]: - """Generate line profiler optimizations using multiple models in parallel.""" - logger.info("Generating optimized candidates with line profiler…") - console.rule() - - futures: list[tuple[concurrent.futures.Future[list[OptimizedCandidate]], str]] = [] - - call_index = 0 - for model_name, num_calls in model_distribution: - for _ in range(num_calls): - call_trace_id = f"{base_trace_id[:-3]}1{call_index:02x}" - call_sequence = sequence_offset + call_index + 1 - call_index += 1 - future = executor.submit( - self.optimize_python_code_line_profiler, - source_code, - dependency_code, - call_trace_id, - line_profiler_results, - experiment_metadata, - model_name, - call_sequence, - ) - futures.append((future, model_name)) - - concurrent.futures.wait([f for f, _ in futures]) - - all_candidates: list[OptimizedCandidate] = [] - for future, model_name in futures: - try: - candidates = future.result() - all_candidates.extend(candidates) - except Exception as e: - logger.warning(f"Line profiler model {model_name} call failed: {e}") - continue - - console.rule() - return all_candidates, call_index + return [] def optimize_python_code_refinement(self, request: list[AIServiceRefinerRequest]) -> list[OptimizedCandidate]: """Optimize the given python code for performance by making a request to the Django endpoint. diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py index ba09989f8..88758455e 100644 --- a/codeflash/code_utils/config_consts.py +++ b/codeflash/code_utils/config_consts.py @@ -32,20 +32,6 @@ MAX_N_CANDIDATES = 5 MAX_N_CANDIDATES_LP = 6 -# Multi-model diversity configuration -# Each tuple is (model_name, num_calls) where each call returns 1 candidate -# Standard mode: 3 GPT-4.1 + 2 Claude Sonnet = 5 candidates -MODEL_DISTRIBUTION: list[tuple[str, int]] = [("gpt-4.1", 3), ("claude-sonnet-4-5", 2)] - -# LSP mode: fewer candidates for faster response -MODEL_DISTRIBUTION_LSP: list[tuple[str, int]] = [("gpt-4.1", 2), ("claude-sonnet-4-5", 1)] - -# Line profiler mode: 6 candidates total -MODEL_DISTRIBUTION_LP: list[tuple[str, int]] = [("gpt-4.1", 4), ("claude-sonnet-4-5", 2)] - -# Line profiler LSP mode -MODEL_DISTRIBUTION_LP_LSP: list[tuple[str, int]] = [("gpt-4.1", 2), ("claude-sonnet-4-5", 1)] - try: from codeflash.lsp.helpers import is_LSP_enabled @@ -57,7 +43,5 @@ N_CANDIDATES_LP_EFFECTIVE = min(N_CANDIDATES_LP_LSP if _IS_LSP_ENABLED else N_CANDIDATES_LP, MAX_N_CANDIDATES_LP) N_TESTS_TO_GENERATE_EFFECTIVE = N_TESTS_TO_GENERATE_LSP if _IS_LSP_ENABLED else N_TESTS_TO_GENERATE TOTAL_LOOPING_TIME_EFFECTIVE = TOTAL_LOOPING_TIME_LSP if _IS_LSP_ENABLED else TOTAL_LOOPING_TIME -MODEL_DISTRIBUTION_EFFECTIVE = MODEL_DISTRIBUTION_LSP if _IS_LSP_ENABLED else MODEL_DISTRIBUTION -MODEL_DISTRIBUTION_LP_EFFECTIVE = MODEL_DISTRIBUTION_LP_LSP if _IS_LSP_ENABLED else MODEL_DISTRIBUTION_LP MAX_CONTEXT_LEN_REVIEW = 1000 diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 6228ee01a..138fe9424 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -46,8 +46,6 @@ COVERAGE_THRESHOLD, INDIVIDUAL_TESTCASE_TIMEOUT, MAX_REPAIRS_PER_TRACE, - MODEL_DISTRIBUTION_EFFECTIVE, - MODEL_DISTRIBUTION_LP_EFFECTIVE, N_TESTS_TO_GENERATE_EFFECTIVE, REFINE_ALL_THRESHOLD, REFINED_CANDIDATE_RANKING_WEIGHTS, @@ -139,7 +137,6 @@ def __init__( ai_service_client: AiServiceClient, executor: 
concurrent.futures.ThreadPoolExecutor, future_all_code_repair: list[concurrent.futures.Future], - sequence_offset: int = 0, ) -> None: self.candidate_queue = queue.Queue() self.line_profiler_done = False @@ -147,8 +144,6 @@ def __init__( self.candidate_len = len(initial_candidates) self.ai_service_client = ai_service_client self.executor = executor - self.sequence_offset = sequence_offset - self.lp_calls_count = 0 self.refinement_calls_count = 0 # Initialize queue with initial candidates @@ -160,7 +155,7 @@ def __init__( self.future_all_code_repair = future_all_code_repair def get_total_llm_calls(self) -> int: - return self.sequence_offset + self.lp_calls_count + self.refinement_calls_count + return self.refinement_calls_count def get_next_candidate(self) -> OptimizedCandidate | None: """Get the next candidate from the queue, handling async results as needed.""" @@ -183,11 +178,7 @@ def _process_line_profiler_results(self) -> OptimizedCandidate | None: """Process line profiler results and add to queue.""" logger.debug("all candidates processed, await candidates from line profiler") concurrent.futures.wait([self.future_line_profile_results]) - result = self.future_line_profile_results.result() - - # LP multi-model now returns (candidates, lp_call_count) - line_profile_results, lp_call_count = result - self.lp_calls_count = lp_call_count + line_profile_results = self.future_line_profile_results.result() for candidate in line_profile_results: self.candidate_queue.put(candidate) @@ -203,18 +194,13 @@ def refine_optimizations(self, request: list[AIServiceRefinerRequest]) -> concur def _process_refinement_results(self) -> OptimizedCandidate | None: """Process refinement results and add to queue. We generate a weighted ranking based on the runtime and diff lines and select the best (round of 45%) of valid optimizations to be refined.""" - import dataclasses - future_refinements: list[concurrent.futures.Future] = [] - # Calculate base sequence: offset + lp_calls (refinements come after LP) - base_sequence = self.sequence_offset + self.lp_calls_count refinement_call_index = 0 if len(self.all_refinements_data) <= REFINE_ALL_THRESHOLD: for data in self.all_refinements_data: refinement_call_index += 1 - data_with_seq = dataclasses.replace(data, call_sequence=base_sequence + refinement_call_index) - future_refinements.append(self.refine_optimizations([data_with_seq])) + future_refinements.append(self.refine_optimizations([data])) else: diff_lens_list = [] runtimes_list = [] @@ -235,8 +221,7 @@ def _process_refinement_results(self) -> OptimizedCandidate | None: for idx in top_indecies: refinement_call_index += 1 data = self.all_refinements_data[idx] - data_with_seq = dataclasses.replace(data, call_sequence=base_sequence + refinement_call_index) - future_refinements.append(self.refine_optimizations([data_with_seq])) + future_refinements.append(self.refine_optimizations([data])) # Track total refinement calls made self.refinement_calls_count = refinement_call_index @@ -342,10 +327,8 @@ def __init__( self.optimization_review = "" self.future_all_code_repair: list[concurrent.futures.Future] = [] self.repair_counter = 0 # track how many repairs we did for each function - self.test_gen_calls_count = 0 - self.optimize_calls_count = 0 - self.lp_calls_count = 0 - self.total_llm_calls = 0 + # Counter for post-optimization LLM calls (explanation, review) - optimization calls are handled by backend + self.post_optimization_call_count = 0 def can_be_optimized(self) -> Result[tuple[bool, CodeOptimizationContext, 
dict[Path, str]], str]: should_run_experiment = self.experiment_id is not None @@ -949,19 +932,16 @@ def determine_best_candidate( assert ai_service_client is not None, "AI service client must be set for optimization" future_line_profile_results = self.executor.submit( - ai_service_client.optimize_python_code_line_profiler_multi_model, + ai_service_client.optimize_python_code_line_profiler, source_code=code_context.read_writable_code.markdown, dependency_code=code_context.read_only_context_code, - base_trace_id=self.get_trace_id(exp_type), + trace_id=self.get_trace_id(exp_type), line_profiler_results=original_code_baseline.line_profile_results["str_out"], - model_distribution=MODEL_DISTRIBUTION_LP_EFFECTIVE, - executor=self.executor, experiment_metadata=ExperimentMetadata( id=self.experiment_id, group="control" if exp_type == "EXP0" else "experiment" ) if self.experiment_id else None, - sequence_offset=self.optimize_calls_count, ) processor = CandidateProcessor( @@ -971,7 +951,6 @@ def determine_best_candidate( self.aiservice_client, self.executor, self.future_all_code_repair, - sequence_offset=self.optimize_calls_count, ) candidate_index = 0 @@ -1005,9 +984,6 @@ def determine_best_candidate( self.function_to_optimize_source_code, original_helper_code, self.function_to_optimize.file_path ) - # Track total LLM calls from the processor for sequence numbering - self.total_llm_calls = processor.get_total_llm_calls() - # Select and return the best optimization best_optimization = self.select_best_optimization( eval_ctx=eval_ctx, @@ -1386,17 +1362,14 @@ def generate_optimizations( read_only_context_code: str, run_experiment: bool = False, # noqa: FBT001, FBT002 ) -> Result[tuple[OptimizationSet, str], str]: - """Generate optimization candidates for the function using multiple models in parallel.""" + """Generate optimization candidates for the function. 
Backend handles multi-model diversity.""" future_optimization_candidates = self.executor.submit( - self.aiservice_client.optimize_python_code_multi_model, + self.aiservice_client.optimize_python_code, read_writable_code.markdown, read_only_context_code, self.function_trace_id[:-4] + "EXP0" if run_experiment else self.function_trace_id, - MODEL_DISTRIBUTION_EFFECTIVE, - self.executor, ExperimentMetadata(id=self.experiment_id, group="control") if run_experiment else None, is_async=self.function_to_optimize.is_async, - sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, ) future_references = self.executor.submit( @@ -1413,34 +1386,29 @@ def generate_optimizations( if run_experiment: future_candidates_exp = self.executor.submit( - self.local_aiservice_client.optimize_python_code_multi_model, + self.local_aiservice_client.optimize_python_code, read_writable_code.markdown, read_only_context_code, self.function_trace_id[:-4] + "EXP1", - MODEL_DISTRIBUTION_EFFECTIVE, - self.executor, ExperimentMetadata(id=self.experiment_id, group="experiment"), is_async=self.function_to_optimize.is_async, - sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, ) futures.append(future_candidates_exp) # Wait for optimization futures to complete concurrent.futures.wait(futures) - # Retrieve results - optimize_python_code_multi_model returns (candidates, call_count) - candidates, optimize_call_count = future_optimization_candidates.result() - # Total sequence count = test gen calls + optimization calls (LP will continue from here) - self.optimize_calls_count = N_TESTS_TO_GENERATE_EFFECTIVE + optimize_call_count - logger.info(f"!lsp|Completed {optimize_call_count} optimization calls, got {len(candidates)} candidates.") + # Retrieve results - optimize_python_code returns list of candidates + candidates = future_optimization_candidates.result() + logger.info(f"!lsp|Received {len(candidates)} optimization candidates.") if not candidates: return Failure(f"/!\\ NO OPTIMIZATIONS GENERATED for {self.function_to_optimize.function_name}") - # Handle experiment results - also returns (candidates, call_count) tuple + # Handle experiment results candidates_experiment = None if future_candidates_exp: - candidates_experiment, _ = future_candidates_exp.result() + candidates_experiment = future_candidates_exp.result() function_references = future_references.result() return Success((OptimizationSet(control=candidates, experiment=candidates_experiment), function_references)) @@ -1687,9 +1655,9 @@ def process_review( ) throughput_improvement_str = f"{throughput_improvement_value * 100:.1f}%" - # Explanation call continues the sequence numbering - explanation_call_sequence = self.total_llm_calls + 1 - self.total_llm_calls = explanation_call_sequence + # Explanation call sequence for tracking + self.post_optimization_call_count += 1 + explanation_call_sequence = self.post_optimization_call_count new_explanation_raw_str = self.aiservice_client.get_new_explanation( source_code=code_context.read_writable_code.flat, @@ -1744,9 +1712,9 @@ def process_review( staging_review = self.args.staging_review opt_review_response = "" # this will now run regardless of pr, staging review flags - # Optimization review call continues the sequence numbering - review_call_sequence = self.total_llm_calls + 1 - self.total_llm_calls = review_call_sequence + # Review call sequence for tracking + self.post_optimization_call_count += 1 + review_call_sequence = self.post_optimization_call_count try: opt_review_response = self.aiservice_client.get_optimization_review( @@ -2241,9 
+2209,6 @@ def submit_test_generation_tasks( generated_test_paths: list[Path], generated_perf_test_paths: list[Path], ) -> list[concurrent.futures.Future]: - # Track how many test generation calls we're making for sequence numbering - self.test_gen_calls_count = len(generated_test_paths) - return [ executor.submit( generate_tests, From 04ff30a0e7e3e88cd8cce6817ca4a73ad96ef8a1 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Fri, 26 Dec 2025 16:06:35 -0500 Subject: [PATCH 08/11] fix call sequence --- codeflash/api/aiservice.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index e858726b1..a65e0e947 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -151,6 +151,7 @@ def optimize_python_code( # noqa: D417 "repo_name": git_repo_name, "is_async": is_async, "lsp_mode": is_LSP_enabled(), + "call_sequence": 1, } logger.debug(f"Sending optimize request: trace_id={trace_id}, lsp_mode={payload['lsp_mode']}") @@ -217,6 +218,7 @@ def optimize_python_code_line_profiler( # noqa: D417 "experiment_metadata": experiment_metadata, "codeflash_version": codeflash_version, "lsp_mode": is_LSP_enabled(), + "call_sequence": 1, } try: From 9c03391c5d385cc3af8eb1c05e43bd5a8509f51d Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Fri, 26 Dec 2025 17:20:42 -0500 Subject: [PATCH 09/11] count properly --- codeflash/api/aiservice.py | 22 +++++++++++--------- codeflash/optimization/function_optimizer.py | 14 +------------ codeflash/verification/verifier.py | 2 -- 3 files changed, 13 insertions(+), 25 deletions(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index a65e0e947..df279fd6f 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -4,6 +4,7 @@ import os import platform import time +from itertools import count from typing import TYPE_CHECKING, Any, cast import requests @@ -39,6 +40,11 @@ class AiServiceClient: def __init__(self) -> None: self.base_url = self.get_aiservice_base_url() self.headers = {"Authorization": f"Bearer {get_codeflash_api_key()}", "Connection": "close"} + self.llm_call_counter = count(1) + + def get_next_sequence(self) -> int: + """Get the next LLM call sequence number.""" + return next(self.llm_call_counter) def get_aiservice_base_url(self) -> str: if os.environ.get("CODEFLASH_AIS_SERVER", default="prod").lower() == "local": @@ -151,7 +157,7 @@ def optimize_python_code( # noqa: D417 "repo_name": git_repo_name, "is_async": is_async, "lsp_mode": is_LSP_enabled(), - "call_sequence": 1, + "call_sequence": self.get_next_sequence(), } logger.debug(f"Sending optimize request: trace_id={trace_id}, lsp_mode={payload['lsp_mode']}") @@ -218,7 +224,7 @@ def optimize_python_code_line_profiler( # noqa: D417 "experiment_metadata": experiment_metadata, "codeflash_version": codeflash_version, "lsp_mode": is_LSP_enabled(), - "call_sequence": 1, + "call_sequence": self.get_next_sequence(), } try: @@ -269,7 +275,7 @@ def optimize_python_code_refinement(self, request: list[AIServiceRefinerRequest] "trace_id": opt.trace_id, "function_references": opt.function_references, "python_version": platform.python_version(), - "call_sequence": opt.call_sequence, + "call_sequence": self.get_next_sequence(), } for opt in request ] @@ -359,7 +365,6 @@ def get_new_explanation( # noqa: D417 throughput_improvement: str | None = None, function_references: str | None = None, codeflash_version: str = codeflash_version, - call_sequence: int | None = None, ) -> str: """Optimize the given python code for 
performance by making a request to the Django endpoint. @@ -405,7 +410,7 @@ def get_new_explanation( # noqa: D417 "throughput_improvement": throughput_improvement, "function_references": function_references, "codeflash_version": codeflash_version, - "call_sequence": call_sequence, + "call_sequence": self.get_next_sequence(), } logger.info("loading|Generating explanation") console.rule() @@ -533,7 +538,6 @@ def generate_regression_tests( # noqa: D417 test_timeout: int, trace_id: str, test_index: int, - call_sequence: int | None = None, ) -> tuple[str, str, str] | None: """Generate regression tests for the given function by making a request to the Django endpoint. @@ -569,7 +573,7 @@ def generate_regression_tests( # noqa: D417 "python_version": platform.python_version(), "codeflash_version": codeflash_version, "is_async": function_to_optimize.is_async, - "call_sequence": call_sequence, + "call_sequence": self.get_next_sequence(), } try: response = self.make_ai_service_request("/testgen", payload=payload, timeout=90) @@ -610,7 +614,6 @@ def get_optimization_review( replay_tests: str, concolic_tests: str, # noqa: ARG002 calling_fn_details: str, - call_sequence: int | None = None, ) -> str: """Compute the optimization review of current Pull Request. @@ -626,7 +629,6 @@ def get_optimization_review( root_dir: Path -> path of git directory concolic_tests: str -> concolic_tests (not used) calling_fn_details: str -> filenames and definitions of functions which call the function_to_optimize - call_sequence: int | None -> sequence number for multi-model calls Returns: ------- @@ -658,7 +660,7 @@ def get_optimization_review( "codeflash_version": codeflash_version, "calling_fn_details": calling_fn_details, "python_version": platform.python_version(), - "call_sequence": call_sequence, + "call_sequence": self.get_next_sequence(), } console.rule() try: diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 138fe9424..45adf8d44 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -327,8 +327,6 @@ def __init__( self.optimization_review = "" self.future_all_code_repair: list[concurrent.futures.Future] = [] self.repair_counter = 0 # track how many repairs we did for each function - # Counter for post-optimization LLM calls (explanation, review) - optimization calls are handled by backend - self.post_optimization_call_count = 0 def can_be_optimized(self) -> Result[tuple[bool, CodeOptimizationContext, dict[Path, str]], str]: should_run_experiment = self.experiment_id is not None @@ -1655,10 +1653,6 @@ def process_review( ) throughput_improvement_str = f"{throughput_improvement_value * 100:.1f}%" - # Explanation call sequence for tracking - self.post_optimization_call_count += 1 - explanation_call_sequence = self.post_optimization_call_count - new_explanation_raw_str = self.aiservice_client.get_new_explanation( source_code=code_context.read_writable_code.flat, dependency_code=code_context.read_only_context_code, @@ -1676,7 +1670,6 @@ def process_review( optimized_throughput=optimized_throughput_str, throughput_improvement=throughput_improvement_str, function_references=function_references, - call_sequence=explanation_call_sequence, ) new_explanation = Explanation( raw_explanation_message=new_explanation_raw_str or explanation.raw_explanation_message, @@ -1712,13 +1705,9 @@ def process_review( staging_review = self.args.staging_review opt_review_response = "" # this will now run regardless of pr, 
staging review flags - # Review call sequence for tracking - self.post_optimization_call_count += 1 - review_call_sequence = self.post_optimization_call_count - try: opt_review_response = self.aiservice_client.get_optimization_review( - **data, calling_fn_details=function_references, call_sequence=review_call_sequence + **data, calling_fn_details=function_references ) except Exception as e: logger.debug(f"optimization review response failed, investigate {e}") @@ -2223,7 +2212,6 @@ def submit_test_generation_tasks( test_index, test_path, test_perf_path, - call_sequence=test_index + 1, ) for test_index, (test_path, test_perf_path) in enumerate( zip(generated_test_paths, generated_perf_test_paths) diff --git a/codeflash/verification/verifier.py b/codeflash/verification/verifier.py index d94455df3..8d187f2b1 100644 --- a/codeflash/verification/verifier.py +++ b/codeflash/verification/verifier.py @@ -27,7 +27,6 @@ def generate_tests( test_index: int, test_path: Path, test_perf_path: Path, - call_sequence: int | None = None, ) -> tuple[str, str, Path] | None: # TODO: Sometimes this recreates the original Class definition. This overrides and messes up the original # class import. Remove the recreation of the class definition @@ -43,7 +42,6 @@ def generate_tests( test_timeout=test_timeout, trace_id=function_trace_id, test_index=test_index, - call_sequence=call_sequence, ) if response and isinstance(response, tuple) and len(response) == 3: generated_test_source, instrumented_behavior_test_source, instrumented_perf_test_source = response From 2237ad6b893070ed480cafb5a9a5f08658b3090c Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 30 Dec 2025 03:49:36 -0500 Subject: [PATCH 10/11] cleanup UI --- codeflash/discovery/discover_unit_tests.py | 1 + codeflash/optimization/function_optimizer.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/codeflash/discovery/discover_unit_tests.py b/codeflash/discovery/discover_unit_tests.py index bc0e2fd67..587b972ee 100644 --- a/codeflash/discovery/discover_unit_tests.py +++ b/codeflash/discovery/discover_unit_tests.py @@ -751,6 +751,7 @@ def process_test_files( tests_cache = TestsCache(project_root_path) logger.info("!lsp|Discovering tests and processing unit tests") + console.rule() with test_files_progress_bar(total=len(file_to_test_map), description="Processing test files") as ( progress, task_id, diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 45adf8d44..2a65b57ba 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -245,6 +245,7 @@ def _process_refinement_results(self) -> OptimizedCandidate | None: logger.info( f"Added {len(refinement_response)} candidates from refinement, total candidates now: {self.candidate_len}" ) + console.rule() self.refinement_done = True return self.get_next_candidate() @@ -1213,7 +1214,6 @@ def instrument_existing_tests(self, function_to_all_tests: dict[str, set[Functio func_qualname = self.function_to_optimize.qualified_name_with_modules_from_root(self.project_root) if func_qualname not in function_to_all_tests: logger.info(f"Did not find any pre-existing tests for '{func_qualname}', will only use generated tests.") - console.rule() else: test_file_invocation_positions = defaultdict(list) for tests_in_file in function_to_all_tests.get(func_qualname): @@ -1349,7 +1349,8 @@ def generate_tests( if concolic_test_str: count_tests += 1 - logger.info(f"!lsp|Generated '{count_tests}' tests 
for '{self.function_to_optimize.function_name}'") + logger.info(f"!lsp|Generated {count_tests} tests for '{self.function_to_optimize.function_name}'") + console.rule() generated_tests = GeneratedTestsList(generated_tests=tests) return Success((count_tests, generated_tests, function_to_concolic_tests, concolic_test_str)) @@ -1398,7 +1399,6 @@ def generate_optimizations( # Retrieve results - optimize_python_code returns list of candidates candidates = future_optimization_candidates.result() - logger.info(f"!lsp|Received {len(candidates)} optimization candidates.") if not candidates: return Failure(f"/!\\ NO OPTIMIZATIONS GENERATED for {self.function_to_optimize.function_name}") @@ -2026,6 +2026,7 @@ def run_optimized_candidate( return self.get_results_not_matched_error() logger.info(f"loading|Running performance tests for candidate {optimization_candidate_index}...") + console.rule() # For async functions, instrument at definition site for performance benchmarking if self.function_to_optimize.is_async: From 1be8302255da4875b5d177d1e57706fad3cf6175 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Wed, 31 Dec 2025 03:12:35 -0500 Subject: [PATCH 11/11] revert timeouts --- codeflash/api/aiservice.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index df279fd6f..c18495899 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -162,7 +162,7 @@ def optimize_python_code( # noqa: D417 logger.debug(f"Sending optimize request: trace_id={trace_id}, lsp_mode={payload['lsp_mode']}") try: - response = self.make_ai_service_request("/optimize", payload=payload, timeout=120) + response = self.make_ai_service_request("/optimize", payload=payload, timeout=60) except requests.exceptions.RequestException as e: logger.exception(f"Error generating optimized candidates: {e}") ph("cli-optimize-error-caught", {"error": str(e)}) @@ -228,7 +228,7 @@ def optimize_python_code_line_profiler( # noqa: D417 } try: - response = self.make_ai_service_request("/optimize-line-profiler", payload=payload, timeout=120) + response = self.make_ai_service_request("/optimize-line-profiler", payload=payload, timeout=60) except requests.exceptions.RequestException as e: logger.exception(f"Error generating optimized candidates: {e}") ph("cli-optimize-error-caught", {"error": str(e)})