From 3ee6e90bdd9f6ab210d3ae34b399bd8d25acbd1f Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 23 Dec 2025 03:31:07 -0500 Subject: [PATCH 01/11] divertsity --- codeflash/code_utils/config_consts.py | 28 ++++++++++++++++++++ codeflash/models/models.py | 1 + codeflash/optimization/function_optimizer.py | 27 +++++++++++-------- 3 files changed, 45 insertions(+), 11 deletions(-) diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py index 88758455e..aa31d8063 100644 --- a/codeflash/code_utils/config_consts.py +++ b/codeflash/code_utils/config_consts.py @@ -32,6 +32,32 @@ MAX_N_CANDIDATES = 5 MAX_N_CANDIDATES_LP = 6 +# Multi-model diversity configuration +# Each tuple is (model_name, num_calls) where each call returns 1 candidate +# Standard mode: 3 GPT-4.1 + 2 Claude Sonnet = 5 candidates +MODEL_DISTRIBUTION: list[tuple[str, int]] = [ + ("gpt-4.1", 3), + ("claude-sonnet-4-5", 2), +] + +# LSP mode: fewer candidates for faster response +MODEL_DISTRIBUTION_LSP: list[tuple[str, int]] = [ + ("gpt-4.1", 2), + ("claude-sonnet-4-5", 1), +] + +# Line profiler mode: 6 candidates total +MODEL_DISTRIBUTION_LP: list[tuple[str, int]] = [ + ("gpt-4.1", 4), + ("claude-sonnet-4-5", 2), +] + +# Line profiler LSP mode +MODEL_DISTRIBUTION_LP_LSP: list[tuple[str, int]] = [ + ("gpt-4.1", 2), + ("claude-sonnet-4-5", 1), +] + try: from codeflash.lsp.helpers import is_LSP_enabled @@ -43,5 +69,7 @@ N_CANDIDATES_LP_EFFECTIVE = min(N_CANDIDATES_LP_LSP if _IS_LSP_ENABLED else N_CANDIDATES_LP, MAX_N_CANDIDATES_LP) N_TESTS_TO_GENERATE_EFFECTIVE = N_TESTS_TO_GENERATE_LSP if _IS_LSP_ENABLED else N_TESTS_TO_GENERATE TOTAL_LOOPING_TIME_EFFECTIVE = TOTAL_LOOPING_TIME_LSP if _IS_LSP_ENABLED else TOTAL_LOOPING_TIME +MODEL_DISTRIBUTION_EFFECTIVE = MODEL_DISTRIBUTION_LSP if _IS_LSP_ENABLED else MODEL_DISTRIBUTION +MODEL_DISTRIBUTION_LP_EFFECTIVE = MODEL_DISTRIBUTION_LP_LSP if _IS_LSP_ENABLED else MODEL_DISTRIBUTION_LP MAX_CONTEXT_LEN_REVIEW = 1000 diff --git a/codeflash/models/models.py b/codeflash/models/models.py index 1db09bc12..4f7553818 100644 --- a/codeflash/models/models.py +++ b/codeflash/models/models.py @@ -464,6 +464,7 @@ class OptimizedCandidate: optimization_id: str source: OptimizedCandidateSource parent_id: str | None = None + model: str | None = None # Which LLM model generated this candidate @dataclass(frozen=True) diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 416bdc8df..8776d9c58 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -46,6 +46,8 @@ COVERAGE_THRESHOLD, INDIVIDUAL_TESTCASE_TIMEOUT, MAX_REPAIRS_PER_TRACE, + MODEL_DISTRIBUTION_EFFECTIVE, + MODEL_DISTRIBUTION_LP_EFFECTIVE, N_CANDIDATES_EFFECTIVE, N_CANDIDATES_LP_EFFECTIVE, N_TESTS_TO_GENERATE_EFFECTIVE, @@ -921,18 +923,20 @@ def determine_best_candidate( ai_service_client = self.aiservice_client if exp_type == "EXP0" else self.local_aiservice_client assert ai_service_client is not None, "AI service client must be set for optimization" + # Use multi-model approach for line profiler optimization future_line_profile_results = self.executor.submit( - ai_service_client.optimize_python_code_line_profiler, + ai_service_client.optimize_python_code_line_profiler_multi_model, source_code=code_context.read_writable_code.markdown, dependency_code=code_context.read_only_context_code, - trace_id=self.get_trace_id(exp_type), + base_trace_id=self.get_trace_id(exp_type), 
line_profiler_results=original_code_baseline.line_profile_results["str_out"], - num_candidates=N_CANDIDATES_LP_EFFECTIVE, + model_distribution=MODEL_DISTRIBUTION_LP_EFFECTIVE, experiment_metadata=ExperimentMetadata( id=self.experiment_id, group="control" if exp_type == "EXP0" else "experiment" ) if self.experiment_id else None, + executor=self.executor, ) processor = CandidateProcessor( @@ -1353,17 +1357,17 @@ def generate_optimizations( read_only_context_code: str, run_experiment: bool = False, # noqa: FBT001, FBT002 ) -> Result[tuple[OptimizationSet, str], str]: - """Generate optimization candidates for the function.""" - n_candidates = N_CANDIDATES_EFFECTIVE - + """Generate optimization candidates for the function using multiple models in parallel.""" + # Use multi-model approach for diversity future_optimization_candidates = self.executor.submit( - self.aiservice_client.optimize_python_code, + self.aiservice_client.optimize_python_code_multi_model, read_writable_code.markdown, read_only_context_code, self.function_trace_id[:-4] + "EXP0" if run_experiment else self.function_trace_id, - n_candidates, + MODEL_DISTRIBUTION_EFFECTIVE, ExperimentMetadata(id=self.experiment_id, group="control") if run_experiment else None, is_async=self.function_to_optimize.is_async, + executor=self.executor, ) future_references = self.executor.submit( @@ -1380,13 +1384,14 @@ def generate_optimizations( if run_experiment: future_candidates_exp = self.executor.submit( - self.local_aiservice_client.optimize_python_code, + self.local_aiservice_client.optimize_python_code_multi_model, read_writable_code.markdown, read_only_context_code, self.function_trace_id[:-4] + "EXP1", - n_candidates, + MODEL_DISTRIBUTION_EFFECTIVE, ExperimentMetadata(id=self.experiment_id, group="experiment"), is_async=self.function_to_optimize.is_async, + executor=self.executor, ) futures.append(future_candidates_exp) @@ -1395,7 +1400,7 @@ def generate_optimizations( # Retrieve results candidates: list[OptimizedCandidate] = future_optimization_candidates.result() - logger.info(f"!lsp|Generated '{len(candidates)}' candidate optimizations.") + logger.info(f"!lsp|Generated '{len(candidates)}' candidate optimizations from multiple models.") if not candidates: return Failure(f"/!\\ NO OPTIMIZATIONS GENERATED for {self.function_to_optimize.function_name}") From 35ae79e8e6f778440bae4bdae5f87c863cdf6acd Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 23 Dec 2025 03:42:32 -0500 Subject: [PATCH 02/11] add diversity --- codeflash/api/aiservice.py | 110 ++++++++++++++++++- codeflash/code_utils/config_consts.py | 20 +--- codeflash/optimization/function_optimizer.py | 5 - 3 files changed, 108 insertions(+), 27 deletions(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index 86fb125b7..78d042791 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -1,5 +1,6 @@ from __future__ import annotations +import concurrent.futures import json import os import platform @@ -12,7 +13,6 @@ from codeflash.cli_cmds.console import console, logger from codeflash.code_utils.code_replacer import is_zero_diff from codeflash.code_utils.code_utils import unified_diff_strings -from codeflash.code_utils.config_consts import N_CANDIDATES_EFFECTIVE, N_CANDIDATES_LP_EFFECTIVE from codeflash.code_utils.env_utils import get_codeflash_api_key from codeflash.code_utils.git_utils import get_last_commit_author_if_pr_exists, get_repo_owner_and_name from codeflash.code_utils.time_utils import humanize_runtime @@ -35,6 +35,8 @@ from 
codeflash.models.models import AIServiceCodeRepairRequest, AIServiceRefinerRequest from codeflash.result.explanation import Explanation +multi_model_executor = concurrent.futures.ThreadPoolExecutor(max_workers=10, thread_name_prefix="multi_model") + class AiServiceClient: def __init__(self) -> None: @@ -92,7 +94,7 @@ def make_ai_service_request( return response def _get_valid_candidates( - self, optimizations_json: list[dict[str, Any]], source: OptimizedCandidateSource + self, optimizations_json: list[dict[str, Any]], source: OptimizedCandidateSource, model: str | None = None ) -> list[OptimizedCandidate]: candidates: list[OptimizedCandidate] = [] for opt in optimizations_json: @@ -106,6 +108,7 @@ def _get_valid_candidates( optimization_id=opt["optimization_id"], source=source, parent_id=opt.get("parent_id", None), + model=model, ) ) return candidates @@ -119,6 +122,7 @@ def optimize_python_code( # noqa: D417 experiment_metadata: ExperimentMetadata | None = None, *, is_async: bool = False, + model: str | None = None, ) -> list[OptimizedCandidate]: """Optimize the given python code for performance by making a request to the Django endpoint. @@ -129,6 +133,7 @@ def optimize_python_code( # noqa: D417 - trace_id (str): Trace id of optimization run - num_candidates (int): Number of optimization variants to generate. Default is 10. - experiment_metadata (Optional[ExperimentalMetadata, None]): Any available experiment metadata for this optimization + - model (str | None): Model name to use ("gpt-4.1" or "claude-sonnet-4-5"). Default is None (server default). Returns ------- @@ -149,8 +154,9 @@ def optimize_python_code( # noqa: D417 "current_username": get_last_commit_author_if_pr_exists(None), "repo_owner": git_repo_owner, "repo_name": git_repo_name, - "n_candidates": N_CANDIDATES_EFFECTIVE, + "n_candidates": num_candidates, "is_async": is_async, + "model": model, } logger.info("!lsp|Generating optimized candidates…") @@ -167,7 +173,7 @@ def optimize_python_code( # noqa: D417 console.rule() end_time = time.perf_counter() logger.debug(f"!lsp|Generating possible optimizations took {end_time - start_time:.2f} seconds.") - return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE) + return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE, model=model) try: error = response.json()["error"] except Exception: @@ -185,6 +191,7 @@ def optimize_python_code_line_profiler( # noqa: D417 line_profiler_results: str, num_candidates: int = 10, experiment_metadata: ExperimentMetadata | None = None, + model: str | None = None, ) -> list[OptimizedCandidate]: """Optimize the given python code for performance by making a request to the Django endpoint. @@ -195,6 +202,7 @@ def optimize_python_code_line_profiler( # noqa: D417 - trace_id (str): Trace id of optimization run - num_candidates (int): Number of optimization variants to generate. Default is 10. - experiment_metadata (Optional[ExperimentalMetadata, None]): Any available experiment metadata for this optimization + - model (str | None): Model name to use ("gpt-4.1" or "claude-sonnet-4-5"). Default is None (server default). 
Returns ------- @@ -211,7 +219,8 @@ def optimize_python_code_line_profiler( # noqa: D417 "experiment_metadata": experiment_metadata, "codeflash_version": codeflash_version, "lsp_mode": is_LSP_enabled(), - "n_candidates_lp": N_CANDIDATES_LP_EFFECTIVE, + "n_candidates_lp": num_candidates, + "model": model, } console.rule() @@ -232,7 +241,7 @@ def optimize_python_code_line_profiler( # noqa: D417 f"!lsp|Generated {len(optimizations_json)} candidate optimizations using line profiler information." ) console.rule() - return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE_LP) + return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE_LP, model=model) try: error = response.json()["error"] except Exception: @@ -242,6 +251,95 @@ def optimize_python_code_line_profiler( # noqa: D417 console.rule() return [] + def optimize_python_code_multi_model( + self, + source_code: str, + dependency_code: str, + base_trace_id: str, + model_distribution: list[tuple[str, int]], + experiment_metadata: ExperimentMetadata | None = None, + *, + is_async: bool = False, + ) -> list[OptimizedCandidate]: + """Generate optimizations using multiple models in parallel.""" + futures: list[tuple[concurrent.futures.Future[list[OptimizedCandidate]], str]] = [] + call_index = 0 + + for model_name, num_calls in model_distribution: + for _ in range(num_calls): + call_trace_id = f"{base_trace_id[:-4]}M{call_index:02d}" + call_index += 1 + + future = multi_model_executor.submit( + self.optimize_python_code, + source_code, + dependency_code, + call_trace_id, + num_candidates=1, # Each call returns 1 candidate + experiment_metadata=experiment_metadata, + is_async=is_async, + model=model_name, + ) + futures.append((future, model_name)) + + # Wait for all calls to complete + concurrent.futures.wait([f for f, _ in futures]) + + # Collect results + all_candidates: list[OptimizedCandidate] = [] + for future, model_name in futures: + try: + candidates = future.result() + all_candidates.extend(candidates) + except Exception as e: + logger.warning(f"Model {model_name} call failed: {e}") + continue + + return all_candidates + + def optimize_python_code_line_profiler_multi_model( + self, + source_code: str, + dependency_code: str, + base_trace_id: str, + line_profiler_results: str, + model_distribution: list[tuple[str, int]], + experiment_metadata: ExperimentMetadata | None = None, + ) -> list[OptimizedCandidate]: + """Generate line profiler optimizations using multiple models in parallel.""" + futures: list[tuple[concurrent.futures.Future[list[OptimizedCandidate]], str]] = [] + call_index = 0 + + for model_name, num_calls in model_distribution: + for _ in range(num_calls): + call_trace_id = f"{base_trace_id[:-4]}L{call_index:02d}" + call_index += 1 + + future = multi_model_executor.submit( + self.optimize_python_code_line_profiler, + source_code, + dependency_code, + call_trace_id, + line_profiler_results, + num_candidates=1, + experiment_metadata=experiment_metadata, + model=model_name, + ) + futures.append((future, model_name)) + + concurrent.futures.wait([f for f, _ in futures]) + + all_candidates: list[OptimizedCandidate] = [] + for future, model_name in futures: + try: + candidates = future.result() + all_candidates.extend(candidates) + except Exception as e: + logger.warning(f"Line profiler model {model_name} call failed: {e}") + continue + + return all_candidates + def optimize_python_code_refinement(self, request: list[AIServiceRefinerRequest]) -> list[OptimizedCandidate]: 
"""Optimize the given python code for performance by making a request to the Django endpoint. diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py index aa31d8063..ba09989f8 100644 --- a/codeflash/code_utils/config_consts.py +++ b/codeflash/code_utils/config_consts.py @@ -35,28 +35,16 @@ # Multi-model diversity configuration # Each tuple is (model_name, num_calls) where each call returns 1 candidate # Standard mode: 3 GPT-4.1 + 2 Claude Sonnet = 5 candidates -MODEL_DISTRIBUTION: list[tuple[str, int]] = [ - ("gpt-4.1", 3), - ("claude-sonnet-4-5", 2), -] +MODEL_DISTRIBUTION: list[tuple[str, int]] = [("gpt-4.1", 3), ("claude-sonnet-4-5", 2)] # LSP mode: fewer candidates for faster response -MODEL_DISTRIBUTION_LSP: list[tuple[str, int]] = [ - ("gpt-4.1", 2), - ("claude-sonnet-4-5", 1), -] +MODEL_DISTRIBUTION_LSP: list[tuple[str, int]] = [("gpt-4.1", 2), ("claude-sonnet-4-5", 1)] # Line profiler mode: 6 candidates total -MODEL_DISTRIBUTION_LP: list[tuple[str, int]] = [ - ("gpt-4.1", 4), - ("claude-sonnet-4-5", 2), -] +MODEL_DISTRIBUTION_LP: list[tuple[str, int]] = [("gpt-4.1", 4), ("claude-sonnet-4-5", 2)] # Line profiler LSP mode -MODEL_DISTRIBUTION_LP_LSP: list[tuple[str, int]] = [ - ("gpt-4.1", 2), - ("claude-sonnet-4-5", 1), -] +MODEL_DISTRIBUTION_LP_LSP: list[tuple[str, int]] = [("gpt-4.1", 2), ("claude-sonnet-4-5", 1)] try: from codeflash.lsp.helpers import is_LSP_enabled diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 8776d9c58..afd56519e 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -48,8 +48,6 @@ MAX_REPAIRS_PER_TRACE, MODEL_DISTRIBUTION_EFFECTIVE, MODEL_DISTRIBUTION_LP_EFFECTIVE, - N_CANDIDATES_EFFECTIVE, - N_CANDIDATES_LP_EFFECTIVE, N_TESTS_TO_GENERATE_EFFECTIVE, REFINE_ALL_THRESHOLD, REFINED_CANDIDATE_RANKING_WEIGHTS, @@ -936,7 +934,6 @@ def determine_best_candidate( ) if self.experiment_id else None, - executor=self.executor, ) processor = CandidateProcessor( @@ -1367,7 +1364,6 @@ def generate_optimizations( MODEL_DISTRIBUTION_EFFECTIVE, ExperimentMetadata(id=self.experiment_id, group="control") if run_experiment else None, is_async=self.function_to_optimize.is_async, - executor=self.executor, ) future_references = self.executor.submit( @@ -1391,7 +1387,6 @@ def generate_optimizations( MODEL_DISTRIBUTION_EFFECTIVE, ExperimentMetadata(id=self.experiment_id, group="experiment"), is_async=self.function_to_optimize.is_async, - executor=self.executor, ) futures.append(future_candidates_exp) From cdf85d2c8be74d37a2352b33906b564cfd7fc123 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 23 Dec 2025 03:56:02 -0500 Subject: [PATCH 03/11] implement trace_id observability --- codeflash/api/aiservice.py | 77 +++++++++++--------- codeflash/models/models.py | 1 + codeflash/optimization/function_optimizer.py | 72 +++++++++++++++--- codeflash/verification/verifier.py | 2 + 4 files changed, 105 insertions(+), 47 deletions(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index 78d042791..4dca8096c 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -118,11 +118,11 @@ def optimize_python_code( # noqa: D417 source_code: str, dependency_code: str, trace_id: str, - num_candidates: int = 10, experiment_metadata: ExperimentMetadata | None = None, *, is_async: bool = False, model: str | None = None, + call_sequence: int | None = None, ) -> list[OptimizedCandidate]: """Optimize the given 
python code for performance by making a request to the Django endpoint. @@ -131,9 +131,9 @@ def optimize_python_code( # noqa: D417 - source_code (str): The python code to optimize. - dependency_code (str): The dependency code used as read-only context for the optimization - trace_id (str): Trace id of optimization run - - num_candidates (int): Number of optimization variants to generate. Default is 10. - experiment_metadata (Optional[ExperimentalMetadata, None]): Any available experiment metadata for this optimization - model (str | None): Model name to use ("gpt-4.1" or "claude-sonnet-4-5"). Default is None (server default). + - call_sequence (int | None): Sequence number for multi-model calls (1, 2, 3...). Default is None. Returns ------- @@ -146,7 +146,6 @@ def optimize_python_code( # noqa: D417 payload = { "source_code": source_code, "dependency_code": dependency_code, - "num_variants": num_candidates, "trace_id": trace_id, "python_version": platform.python_version(), "experiment_metadata": experiment_metadata, @@ -154,13 +153,12 @@ def optimize_python_code( # noqa: D417 "current_username": get_last_commit_author_if_pr_exists(None), "repo_owner": git_repo_owner, "repo_name": git_repo_name, - "n_candidates": num_candidates, "is_async": is_async, "model": model, + "call_sequence": call_sequence, } + logger.debug(f"Sending optimize request: model={model}, trace_id={trace_id}, call_sequence={call_sequence}") - logger.info("!lsp|Generating optimized candidates…") - console.rule() try: response = self.make_ai_service_request("/optimize", payload=payload, timeout=60) except requests.exceptions.RequestException as e: @@ -170,9 +168,9 @@ def optimize_python_code( # noqa: D417 if response.status_code == 200: optimizations_json = response.json()["optimizations"] - console.rule() end_time = time.perf_counter() logger.debug(f"!lsp|Generating possible optimizations took {end_time - start_time:.2f} seconds.") + logger.debug(f"Backend returned {len(optimizations_json)} optimization(s)") return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE, model=model) try: error = response.json()["error"] @@ -180,7 +178,6 @@ def optimize_python_code( # noqa: D417 error = response.text logger.error(f"Error generating optimized candidates: {response.status_code} - {error}") ph("cli-optimize-error-response", {"response_status_code": response.status_code, "error": error}) - console.rule() return [] def optimize_python_code_line_profiler( # noqa: D417 @@ -189,9 +186,9 @@ def optimize_python_code_line_profiler( # noqa: D417 dependency_code: str, trace_id: str, line_profiler_results: str, - num_candidates: int = 10, experiment_metadata: ExperimentMetadata | None = None, model: str | None = None, + call_sequence: int | None = None, ) -> list[OptimizedCandidate]: """Optimize the given python code for performance by making a request to the Django endpoint. @@ -200,9 +197,9 @@ def optimize_python_code_line_profiler( # noqa: D417 - source_code (str): The python code to optimize. - dependency_code (str): The dependency code used as read-only context for the optimization - trace_id (str): Trace id of optimization run - - num_candidates (int): Number of optimization variants to generate. Default is 10. - experiment_metadata (Optional[ExperimentalMetadata, None]): Any available experiment metadata for this optimization - model (str | None): Model name to use ("gpt-4.1" or "claude-sonnet-4-5"). Default is None (server default). 
+ - call_sequence (int | None): Sequence number for multi-model calls (1, 2, 3...). Default is None. Returns ------- @@ -212,21 +209,18 @@ def optimize_python_code_line_profiler( # noqa: D417 payload = { "source_code": source_code, "dependency_code": dependency_code, - "num_variants": num_candidates, "line_profiler_results": line_profiler_results, "trace_id": trace_id, "python_version": platform.python_version(), "experiment_metadata": experiment_metadata, "codeflash_version": codeflash_version, "lsp_mode": is_LSP_enabled(), - "n_candidates_lp": num_candidates, "model": model, + "call_sequence": call_sequence, } - console.rule() if line_profiler_results == "": logger.info("No LineProfiler results were provided, Skipping optimization.") - console.rule() return [] try: response = self.make_ai_service_request("/optimize-line-profiler", payload=payload, timeout=60) @@ -237,10 +231,7 @@ def optimize_python_code_line_profiler( # noqa: D417 if response.status_code == 200: optimizations_json = response.json()["optimizations"] - logger.info( - f"!lsp|Generated {len(optimizations_json)} candidate optimizations using line profiler information." - ) - console.rule() + logger.debug(f"Backend returned {len(optimizations_json)} LP optimization(s)") return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE_LP, model=model) try: error = response.json()["error"] @@ -248,7 +239,6 @@ def optimize_python_code_line_profiler( # noqa: D417 error = response.text logger.error(f"Error generating optimized candidates: {response.status_code} - {error}") ph("cli-optimize-error-response", {"response_status_code": response.status_code, "error": error}) - console.rule() return [] def optimize_python_code_multi_model( @@ -260,32 +250,34 @@ def optimize_python_code_multi_model( experiment_metadata: ExperimentMetadata | None = None, *, is_async: bool = False, - ) -> list[OptimizedCandidate]: + sequence_offset: int = 0, + ) -> tuple[list[OptimizedCandidate], int]: """Generate optimizations using multiple models in parallel.""" + logger.info("Generating optimized candidates…") + console.rule() + futures: list[tuple[concurrent.futures.Future[list[OptimizedCandidate]], str]] = [] - call_index = 0 + call_index = 0 for model_name, num_calls in model_distribution: for _ in range(num_calls): - call_trace_id = f"{base_trace_id[:-4]}M{call_index:02d}" + call_trace_id = f"{base_trace_id[:-3]}0{call_index:02x}" + call_sequence = sequence_offset + call_index + 1 call_index += 1 - future = multi_model_executor.submit( self.optimize_python_code, source_code, dependency_code, call_trace_id, - num_candidates=1, # Each call returns 1 candidate - experiment_metadata=experiment_metadata, + experiment_metadata, is_async=is_async, model=model_name, + call_sequence=call_sequence, ) futures.append((future, model_name)) - # Wait for all calls to complete concurrent.futures.wait([f for f, _ in futures]) - # Collect results all_candidates: list[OptimizedCandidate] = [] for future, model_name in futures: try: @@ -295,7 +287,8 @@ def optimize_python_code_multi_model( logger.warning(f"Model {model_name} call failed: {e}") continue - return all_candidates + console.rule() + return all_candidates, call_index def optimize_python_code_line_profiler_multi_model( self, @@ -305,25 +298,29 @@ def optimize_python_code_line_profiler_multi_model( line_profiler_results: str, model_distribution: list[tuple[str, int]], experiment_metadata: ExperimentMetadata | None = None, - ) -> list[OptimizedCandidate]: + sequence_offset: int = 0, + ) 
-> tuple[list[OptimizedCandidate], int]: """Generate line profiler optimizations using multiple models in parallel.""" + logger.info("Generating optimized candidates with line profiler…") + console.rule() + futures: list[tuple[concurrent.futures.Future[list[OptimizedCandidate]], str]] = [] - call_index = 0 + call_index = 0 for model_name, num_calls in model_distribution: for _ in range(num_calls): - call_trace_id = f"{base_trace_id[:-4]}L{call_index:02d}" + call_trace_id = f"{base_trace_id[:-3]}1{call_index:02x}" + call_sequence = sequence_offset + call_index + 1 call_index += 1 - future = multi_model_executor.submit( self.optimize_python_code_line_profiler, source_code, dependency_code, call_trace_id, line_profiler_results, - num_candidates=1, - experiment_metadata=experiment_metadata, - model=model_name, + experiment_metadata, + model_name, + call_sequence, ) futures.append((future, model_name)) @@ -338,7 +335,8 @@ def optimize_python_code_line_profiler_multi_model( logger.warning(f"Line profiler model {model_name} call failed: {e}") continue - return all_candidates + console.rule() + return all_candidates, call_index def optimize_python_code_refinement(self, request: list[AIServiceRefinerRequest]) -> list[OptimizedCandidate]: """Optimize the given python code for performance by making a request to the Django endpoint. @@ -366,6 +364,7 @@ def optimize_python_code_refinement(self, request: list[AIServiceRefinerRequest] "trace_id": opt.trace_id, "function_references": opt.function_references, "python_version": platform.python_version(), + "call_sequence": opt.call_sequence, } for opt in request ] @@ -455,6 +454,7 @@ def get_new_explanation( # noqa: D417 throughput_improvement: str | None = None, function_references: str | None = None, codeflash_version: str = codeflash_version, + call_sequence: int | None = None, ) -> str: """Optimize the given python code for performance by making a request to the Django endpoint. @@ -500,6 +500,7 @@ def get_new_explanation( # noqa: D417 "throughput_improvement": throughput_improvement, "function_references": function_references, "codeflash_version": codeflash_version, + "call_sequence": call_sequence, } logger.info("loading|Generating explanation") console.rule() @@ -627,6 +628,7 @@ def generate_regression_tests( # noqa: D417 test_timeout: int, trace_id: str, test_index: int, + call_sequence: int | None = None, ) -> tuple[str, str, str] | None: """Generate regression tests for the given function by making a request to the Django endpoint. @@ -662,6 +664,7 @@ def generate_regression_tests( # noqa: D417 "python_version": platform.python_version(), "codeflash_version": codeflash_version, "is_async": function_to_optimize.is_async, + "call_sequence": call_sequence, } try: response = self.make_ai_service_request("/testgen", payload=payload, timeout=90) @@ -702,6 +705,7 @@ def get_optimization_review( replay_tests: str, concolic_tests: str, # noqa: ARG002 calling_fn_details: str, + call_sequence: int | None = None, ) -> str: """Compute the optimization review of current Pull Request. 
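For context on the per-call trace IDs built above: the last three characters of the base trace ID are overwritten with what reads here as a phase marker ("0" for the /optimize calls, "1" for the line-profiler calls) plus a two-hex-digit call index, so each parallel backend call stays individually traceable. A minimal sketch of that scheme as read from the patch; the base ID below is hypothetical:

    # Illustration only -- mirrors the f-strings in the multi-model helpers above.
    def per_call_trace_id(base_trace_id: str, phase: str, call_index: int) -> str:
        # phase "0": standard optimize calls, phase "1": line-profiler calls
        return f"{base_trace_id[:-3]}{phase}{call_index:02x}"

    base = "1f2e3d4c5b6a7980aabbccddeeff0011"  # hypothetical base trace id
    print(per_call_trace_id(base, "1", 3))     # -> "1f2e3d4c5b6a7980aabbccddeeff0103"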
@@ -748,6 +752,7 @@ def get_optimization_review( "codeflash_version": codeflash_version, "calling_fn_details": calling_fn_details, "python_version": platform.python_version(), + "call_sequence": call_sequence, } console.rule() try: diff --git a/codeflash/models/models.py b/codeflash/models/models.py index 4f7553818..822ecffab 100644 --- a/codeflash/models/models.py +++ b/codeflash/models/models.py @@ -46,6 +46,7 @@ class AIServiceRefinerRequest: original_line_profiler_results: str optimized_line_profiler_results: str function_references: str | None = None + call_sequence: int | None = None class TestDiffScope(str, Enum): diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index afd56519e..e8e51deb7 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -139,6 +139,7 @@ def __init__( ai_service_client: AiServiceClient, executor: concurrent.futures.ThreadPoolExecutor, future_all_code_repair: list[concurrent.futures.Future], + sequence_offset: int = 0, ) -> None: self.candidate_queue = queue.Queue() self.line_profiler_done = False @@ -146,6 +147,9 @@ def __init__( self.candidate_len = len(initial_candidates) self.ai_service_client = ai_service_client self.executor = executor + self.sequence_offset = sequence_offset + self.lp_calls_count = 0 + self.refinement_calls_count = 0 # Initialize queue with initial candidates for candidate in initial_candidates: @@ -155,6 +159,9 @@ def __init__( self.all_refinements_data = all_refinements_data self.future_all_code_repair = future_all_code_repair + def get_total_llm_calls(self) -> int: + return self.sequence_offset + self.lp_calls_count + self.refinement_calls_count + def get_next_candidate(self) -> OptimizedCandidate | None: """Get the next candidate from the queue, handling async results as needed.""" try: @@ -176,7 +183,11 @@ def _process_line_profiler_results(self) -> OptimizedCandidate | None: """Process line profiler results and add to queue.""" logger.debug("all candidates processed, await candidates from line profiler") concurrent.futures.wait([self.future_line_profile_results]) - line_profile_results = self.future_line_profile_results.result() + result = self.future_line_profile_results.result() + + # LP multi-model now returns (candidates, lp_call_count) + line_profile_results, lp_call_count = result + self.lp_calls_count = lp_call_count for candidate in line_profile_results: self.candidate_queue.put(candidate) @@ -192,11 +203,18 @@ def refine_optimizations(self, request: list[AIServiceRefinerRequest]) -> concur def _process_refinement_results(self) -> OptimizedCandidate | None: """Process refinement results and add to queue. 
We generate a weighted ranking based on the runtime and diff lines and select the best (round of 45%) of valid optimizations to be refined.""" + import dataclasses # noqa: PLC0415 + future_refinements: list[concurrent.futures.Future] = [] + # Calculate base sequence: offset + lp_calls (refinements come after LP) + base_sequence = self.sequence_offset + self.lp_calls_count + refinement_call_index = 0 if len(self.all_refinements_data) <= REFINE_ALL_THRESHOLD: for data in self.all_refinements_data: - future_refinements.append(self.refine_optimizations([data])) # noqa: PERF401 + refinement_call_index += 1 + data_with_seq = dataclasses.replace(data, call_sequence=base_sequence + refinement_call_index) + future_refinements.append(self.refine_optimizations([data_with_seq])) else: diff_lens_list = [] runtimes_list = [] @@ -215,8 +233,13 @@ def _process_refinement_results(self) -> OptimizedCandidate | None: top_indecies = sorted(score_dict, key=score_dict.get)[:top_n_candidates] for idx in top_indecies: + refinement_call_index += 1 data = self.all_refinements_data[idx] - future_refinements.append(self.refine_optimizations([data])) + data_with_seq = dataclasses.replace(data, call_sequence=base_sequence + refinement_call_index) + future_refinements.append(self.refine_optimizations([data_with_seq])) + + # Track total refinement calls made + self.refinement_calls_count = refinement_call_index if future_refinements: logger.info("loading|Refining generated code for improved quality and performance...") @@ -319,10 +342,14 @@ def __init__( self.optimization_review = "" self.future_all_code_repair: list[concurrent.futures.Future] = [] self.repair_counter = 0 # track how many repairs we did for each function + self.test_gen_calls_count = 0 + self.optimize_calls_count = 0 + self.lp_calls_count = 0 + self.total_llm_calls = 0 def can_be_optimized(self) -> Result[tuple[bool, CodeOptimizationContext, dict[Path, str]], str]: should_run_experiment = self.experiment_id is not None - logger.debug(f"Function Trace ID: {self.function_trace_id}") + logger.info(f"Function Trace ID: {self.function_trace_id}") ph("cli-optimize-function-start", {"function_trace_id": self.function_trace_id}) self.cleanup_leftover_test_return_values() file_name_from_test_module_name.cache_clear() @@ -921,7 +948,6 @@ def determine_best_candidate( ai_service_client = self.aiservice_client if exp_type == "EXP0" else self.local_aiservice_client assert ai_service_client is not None, "AI service client must be set for optimization" - # Use multi-model approach for line profiler optimization future_line_profile_results = self.executor.submit( ai_service_client.optimize_python_code_line_profiler_multi_model, source_code=code_context.read_writable_code.markdown, @@ -934,6 +960,7 @@ def determine_best_candidate( ) if self.experiment_id else None, + sequence_offset=self.optimize_calls_count, ) processor = CandidateProcessor( @@ -943,6 +970,7 @@ def determine_best_candidate( self.aiservice_client, self.executor, self.future_all_code_repair, + sequence_offset=self.optimize_calls_count, ) candidate_index = 0 @@ -976,6 +1004,9 @@ def determine_best_candidate( self.function_to_optimize_source_code, original_helper_code, self.function_to_optimize.file_path ) + # Track total LLM calls from the processor for sequence numbering + self.total_llm_calls = processor.get_total_llm_calls() + # Select and return the best optimization best_optimization = self.select_best_optimization( eval_ctx=eval_ctx, @@ -1355,7 +1386,6 @@ def generate_optimizations( run_experiment: 
bool = False, # noqa: FBT001, FBT002 ) -> Result[tuple[OptimizationSet, str], str]: """Generate optimization candidates for the function using multiple models in parallel.""" - # Use multi-model approach for diversity future_optimization_candidates = self.executor.submit( self.aiservice_client.optimize_python_code_multi_model, read_writable_code.markdown, @@ -1364,6 +1394,7 @@ def generate_optimizations( MODEL_DISTRIBUTION_EFFECTIVE, ExperimentMetadata(id=self.experiment_id, group="control") if run_experiment else None, is_async=self.function_to_optimize.is_async, + sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, ) future_references = self.executor.submit( @@ -1387,20 +1418,26 @@ def generate_optimizations( MODEL_DISTRIBUTION_EFFECTIVE, ExperimentMetadata(id=self.experiment_id, group="experiment"), is_async=self.function_to_optimize.is_async, + sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, ) futures.append(future_candidates_exp) # Wait for optimization futures to complete concurrent.futures.wait(futures) - # Retrieve results - candidates: list[OptimizedCandidate] = future_optimization_candidates.result() - logger.info(f"!lsp|Generated '{len(candidates)}' candidate optimizations from multiple models.") + # Retrieve results - optimize_python_code_multi_model returns (candidates, call_count) + candidates, optimize_call_count = future_optimization_candidates.result() + # Total sequence count = test gen calls + optimization calls (LP will continue from here) + self.optimize_calls_count = N_TESTS_TO_GENERATE_EFFECTIVE + optimize_call_count + logger.info(f"!lsp|Completed {optimize_call_count} optimization calls, got {len(candidates)} candidates.") if not candidates: return Failure(f"/!\\ NO OPTIMIZATIONS GENERATED for {self.function_to_optimize.function_name}") - candidates_experiment = future_candidates_exp.result() if future_candidates_exp else None + # Handle experiment results - also returns (candidates, call_count) tuple + candidates_experiment = None + if future_candidates_exp: + candidates_experiment, _ = future_candidates_exp.result() function_references = future_references.result() return Success((OptimizationSet(control=candidates, experiment=candidates_experiment), function_references)) @@ -1647,6 +1684,10 @@ def process_review( ) throughput_improvement_str = f"{throughput_improvement_value * 100:.1f}%" + # Explanation call continues the sequence numbering + explanation_call_sequence = self.total_llm_calls + 1 + self.total_llm_calls = explanation_call_sequence + new_explanation_raw_str = self.aiservice_client.get_new_explanation( source_code=code_context.read_writable_code.flat, dependency_code=code_context.read_only_context_code, @@ -1664,6 +1705,7 @@ def process_review( optimized_throughput=optimized_throughput_str, throughput_improvement=throughput_improvement_str, function_references=function_references, + call_sequence=explanation_call_sequence, ) new_explanation = Explanation( raw_explanation_message=new_explanation_raw_str or explanation.raw_explanation_message, @@ -1699,9 +1741,13 @@ def process_review( staging_review = self.args.staging_review opt_review_response = "" # this will now run regardless of pr, staging review flags + # Optimization review call continues the sequence numbering + review_call_sequence = self.total_llm_calls + 1 + self.total_llm_calls = review_call_sequence + try: opt_review_response = self.aiservice_client.get_optimization_review( - **data, calling_fn_details=function_references + **data, calling_fn_details=function_references, 
call_sequence=review_call_sequence ) except Exception as e: logger.debug(f"optimization review response failed, investigate {e}") @@ -2192,6 +2238,9 @@ def submit_test_generation_tasks( generated_test_paths: list[Path], generated_perf_test_paths: list[Path], ) -> list[concurrent.futures.Future]: + # Track how many test generation calls we're making for sequence numbering + self.test_gen_calls_count = len(generated_test_paths) + return [ executor.submit( generate_tests, @@ -2206,6 +2255,7 @@ def submit_test_generation_tasks( test_index, test_path, test_perf_path, + call_sequence=test_index + 1, ) for test_index, (test_path, test_perf_path) in enumerate( zip(generated_test_paths, generated_perf_test_paths) diff --git a/codeflash/verification/verifier.py b/codeflash/verification/verifier.py index 8d187f2b1..d94455df3 100644 --- a/codeflash/verification/verifier.py +++ b/codeflash/verification/verifier.py @@ -27,6 +27,7 @@ def generate_tests( test_index: int, test_path: Path, test_perf_path: Path, + call_sequence: int | None = None, ) -> tuple[str, str, Path] | None: # TODO: Sometimes this recreates the original Class definition. This overrides and messes up the original # class import. Remove the recreation of the class definition @@ -42,6 +43,7 @@ def generate_tests( test_timeout=test_timeout, trace_id=function_trace_id, test_index=test_index, + call_sequence=call_sequence, ) if response and isinstance(response, tuple) and len(response) == 3: generated_test_source, instrumented_behavior_test_source, instrumented_perf_test_source = response From 5a122a99fb44f6b7a82c95afba9c5a8c716d0b04 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 23 Dec 2025 18:34:37 -0500 Subject: [PATCH 04/11] pre-commit changes --- codeflash/api/aiservice.py | 1 + codeflash/optimization/function_optimizer.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index e233e6a71..fff3611fd 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -721,6 +721,7 @@ def get_optimization_review( root_dir: Path -> path of git directory concolic_tests: str -> concolic_tests (not used) calling_fn_details: str -> filenames and definitions of functions which call the function_to_optimize + call_sequence: int | None -> sequence number for multi-model calls Returns: ------- diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index e8e51deb7..dfafb86a5 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -203,7 +203,7 @@ def refine_optimizations(self, request: list[AIServiceRefinerRequest]) -> concur def _process_refinement_results(self) -> OptimizedCandidate | None: """Process refinement results and add to queue. 
We generate a weighted ranking based on the runtime and diff lines and select the best (round of 45%) of valid optimizations to be refined.""" - import dataclasses # noqa: PLC0415 + import dataclasses future_refinements: list[concurrent.futures.Future] = [] # Calculate base sequence: offset + lp_calls (refinements come after LP) From 1b6e046553ba97c093a13ae59c862d000e685a13 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 23 Dec 2025 18:56:10 -0500 Subject: [PATCH 05/11] have the client manage the TPE --- codeflash/api/aiservice.py | 9 ++++----- codeflash/optimization/function_optimizer.py | 3 +++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index fff3611fd..876ee74c5 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -35,9 +35,6 @@ from codeflash.models.models import AIServiceCodeRepairRequest, AIServiceRefinerRequest from codeflash.result.explanation import Explanation -multi_model_executor = concurrent.futures.ThreadPoolExecutor(max_workers=10, thread_name_prefix="multi_model") - - class AiServiceClient: def __init__(self) -> None: self.base_url = self.get_aiservice_base_url() @@ -251,6 +248,7 @@ def optimize_python_code_multi_model( *, is_async: bool = False, sequence_offset: int = 0, + executor: concurrent.futures.ThreadPoolExecutor | None = None, ) -> tuple[list[OptimizedCandidate], int]: """Generate optimizations using multiple models in parallel.""" logger.info("Generating optimized candidates…") @@ -264,7 +262,7 @@ def optimize_python_code_multi_model( call_trace_id = f"{base_trace_id[:-3]}0{call_index:02x}" call_sequence = sequence_offset + call_index + 1 call_index += 1 - future = multi_model_executor.submit( + future = executor.submit( self.optimize_python_code, source_code, dependency_code, @@ -299,6 +297,7 @@ def optimize_python_code_line_profiler_multi_model( model_distribution: list[tuple[str, int]], experiment_metadata: ExperimentMetadata | None = None, sequence_offset: int = 0, + executor: concurrent.futures.ThreadPoolExecutor | None = None, ) -> tuple[list[OptimizedCandidate], int]: """Generate line profiler optimizations using multiple models in parallel.""" logger.info("Generating optimized candidates with line profiler…") @@ -312,7 +311,7 @@ def optimize_python_code_line_profiler_multi_model( call_trace_id = f"{base_trace_id[:-3]}1{call_index:02x}" call_sequence = sequence_offset + call_index + 1 call_index += 1 - future = multi_model_executor.submit( + future = executor.submit( self.optimize_python_code_line_profiler, source_code, dependency_code, diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index dfafb86a5..b29586b96 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -961,6 +961,7 @@ def determine_best_candidate( if self.experiment_id else None, sequence_offset=self.optimize_calls_count, + executor=self.executor, ) processor = CandidateProcessor( @@ -1395,6 +1396,7 @@ def generate_optimizations( ExperimentMetadata(id=self.experiment_id, group="control") if run_experiment else None, is_async=self.function_to_optimize.is_async, sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, + executor=self.executor, ) future_references = self.executor.submit( @@ -1419,6 +1421,7 @@ def generate_optimizations( ExperimentMetadata(id=self.experiment_id, group="experiment"), is_async=self.function_to_optimize.is_async, sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, + 
executor=self.executor, ) futures.append(future_candidates_exp) From 1c6e9513faa31df7ef57adcb0383fac03819019e Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 23 Dec 2025 18:58:42 -0500 Subject: [PATCH 06/11] we should always have an executor --- codeflash/api/aiservice.py | 5 +++-- codeflash/optimization/function_optimizer.py | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index 876ee74c5..7480252bd 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -35,6 +35,7 @@ from codeflash.models.models import AIServiceCodeRepairRequest, AIServiceRefinerRequest from codeflash.result.explanation import Explanation + class AiServiceClient: def __init__(self) -> None: self.base_url = self.get_aiservice_base_url() @@ -244,11 +245,11 @@ def optimize_python_code_multi_model( dependency_code: str, base_trace_id: str, model_distribution: list[tuple[str, int]], + executor: concurrent.futures.ThreadPoolExecutor, experiment_metadata: ExperimentMetadata | None = None, *, is_async: bool = False, sequence_offset: int = 0, - executor: concurrent.futures.ThreadPoolExecutor | None = None, ) -> tuple[list[OptimizedCandidate], int]: """Generate optimizations using multiple models in parallel.""" logger.info("Generating optimized candidates…") @@ -295,9 +296,9 @@ def optimize_python_code_line_profiler_multi_model( base_trace_id: str, line_profiler_results: str, model_distribution: list[tuple[str, int]], + executor: concurrent.futures.ThreadPoolExecutor, experiment_metadata: ExperimentMetadata | None = None, sequence_offset: int = 0, - executor: concurrent.futures.ThreadPoolExecutor | None = None, ) -> tuple[list[OptimizedCandidate], int]: """Generate line profiler optimizations using multiple models in parallel.""" logger.info("Generating optimized candidates with line profiler…") diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index b29586b96..6228ee01a 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -955,13 +955,13 @@ def determine_best_candidate( base_trace_id=self.get_trace_id(exp_type), line_profiler_results=original_code_baseline.line_profile_results["str_out"], model_distribution=MODEL_DISTRIBUTION_LP_EFFECTIVE, + executor=self.executor, experiment_metadata=ExperimentMetadata( id=self.experiment_id, group="control" if exp_type == "EXP0" else "experiment" ) if self.experiment_id else None, sequence_offset=self.optimize_calls_count, - executor=self.executor, ) processor = CandidateProcessor( @@ -1393,10 +1393,10 @@ def generate_optimizations( read_only_context_code, self.function_trace_id[:-4] + "EXP0" if run_experiment else self.function_trace_id, MODEL_DISTRIBUTION_EFFECTIVE, + self.executor, ExperimentMetadata(id=self.experiment_id, group="control") if run_experiment else None, is_async=self.function_to_optimize.is_async, sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, - executor=self.executor, ) future_references = self.executor.submit( @@ -1418,10 +1418,10 @@ def generate_optimizations( read_only_context_code, self.function_trace_id[:-4] + "EXP1", MODEL_DISTRIBUTION_EFFECTIVE, + self.executor, ExperimentMetadata(id=self.experiment_id, group="experiment"), is_async=self.function_to_optimize.is_async, sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, - executor=self.executor, ) futures.append(future_candidates_exp) From bbb15e78379bf04a0cd22cff04f679d7e396584f Mon Sep 17 00:00:00 2001 
From: Kevin Turcios Date: Fri, 26 Dec 2025 11:57:30 -0500 Subject: [PATCH 07/11] do this elsewhere --- codeflash/api/aiservice.py | 153 ++++--------------- codeflash/code_utils/config_consts.py | 16 -- codeflash/optimization/function_optimizer.py | 79 +++------- 3 files changed, 50 insertions(+), 198 deletions(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index 7480252bd..e858726b1 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -1,6 +1,5 @@ from __future__ import annotations -import concurrent.futures import json import os import platform @@ -92,7 +91,7 @@ def make_ai_service_request( return response def _get_valid_candidates( - self, optimizations_json: list[dict[str, Any]], source: OptimizedCandidateSource, model: str | None = None + self, optimizations_json: list[dict[str, Any]], source: OptimizedCandidateSource ) -> list[OptimizedCandidate]: candidates: list[OptimizedCandidate] = [] for opt in optimizations_json: @@ -106,7 +105,7 @@ def _get_valid_candidates( optimization_id=opt["optimization_id"], source=source, parent_id=opt.get("parent_id", None), - model=model, + model=opt.get("model"), ) ) return candidates @@ -119,8 +118,6 @@ def optimize_python_code( # noqa: D417 experiment_metadata: ExperimentMetadata | None = None, *, is_async: bool = False, - model: str | None = None, - call_sequence: int | None = None, ) -> list[OptimizedCandidate]: """Optimize the given python code for performance by making a request to the Django endpoint. @@ -130,14 +127,15 @@ def optimize_python_code( # noqa: D417 - dependency_code (str): The dependency code used as read-only context for the optimization - trace_id (str): Trace id of optimization run - experiment_metadata (Optional[ExperimentalMetadata, None]): Any available experiment metadata for this optimization - - model (str | None): Model name to use ("gpt-4.1" or "claude-sonnet-4-5"). Default is None (server default). - - call_sequence (int | None): Sequence number for multi-model calls (1, 2, 3...). Default is None. + - is_async (bool): Whether the function being optimized is async Returns ------- - List[OptimizationCandidate]: A list of Optimization Candidates. 
""" + logger.info("Generating optimized candidates…") + console.rule() start_time = time.perf_counter() git_repo_owner, git_repo_name = safe_get_repo_owner_and_name() @@ -152,30 +150,32 @@ def optimize_python_code( # noqa: D417 "repo_owner": git_repo_owner, "repo_name": git_repo_name, "is_async": is_async, - "model": model, - "call_sequence": call_sequence, + "lsp_mode": is_LSP_enabled(), } - logger.debug(f"Sending optimize request: model={model}, trace_id={trace_id}, call_sequence={call_sequence}") + logger.debug(f"Sending optimize request: trace_id={trace_id}, lsp_mode={payload['lsp_mode']}") try: - response = self.make_ai_service_request("/optimize", payload=payload, timeout=60) + response = self.make_ai_service_request("/optimize", payload=payload, timeout=120) except requests.exceptions.RequestException as e: logger.exception(f"Error generating optimized candidates: {e}") ph("cli-optimize-error-caught", {"error": str(e)}) + console.rule() return [] if response.status_code == 200: optimizations_json = response.json()["optimizations"] end_time = time.perf_counter() logger.debug(f"!lsp|Generating possible optimizations took {end_time - start_time:.2f} seconds.") - logger.debug(f"Backend returned {len(optimizations_json)} optimization(s)") - return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE, model=model) + logger.info(f"!lsp|Received {len(optimizations_json)} optimization candidates.") + console.rule() + return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE) try: error = response.json()["error"] except Exception: error = response.text logger.error(f"Error generating optimized candidates: {response.status_code} - {error}") ph("cli-optimize-error-response", {"response_status_code": response.status_code, "error": error}) + console.rule() return [] def optimize_python_code_line_profiler( # noqa: D417 @@ -185,25 +185,29 @@ def optimize_python_code_line_profiler( # noqa: D417 trace_id: str, line_profiler_results: str, experiment_metadata: ExperimentMetadata | None = None, - model: str | None = None, - call_sequence: int | None = None, ) -> list[OptimizedCandidate]: - """Optimize the given python code for performance by making a request to the Django endpoint. + """Optimize the given python code for performance using line profiler results. Parameters ---------- - source_code (str): The python code to optimize. - dependency_code (str): The dependency code used as read-only context for the optimization - trace_id (str): Trace id of optimization run + - line_profiler_results (str): Line profiler output to guide optimization - experiment_metadata (Optional[ExperimentalMetadata, None]): Any available experiment metadata for this optimization - - model (str | None): Model name to use ("gpt-4.1" or "claude-sonnet-4-5"). Default is None (server default). - - call_sequence (int | None): Sequence number for multi-model calls (1, 2, 3...). Default is None. Returns ------- - List[OptimizationCandidate]: A list of Optimization Candidates. 
""" + if line_profiler_results == "": + logger.info("No LineProfiler results were provided, Skipping optimization.") + return [] + + logger.info("Generating optimized candidates with line profiler…") + console.rule() + payload = { "source_code": source_code, "dependency_code": dependency_code, @@ -213,130 +217,29 @@ def optimize_python_code_line_profiler( # noqa: D417 "experiment_metadata": experiment_metadata, "codeflash_version": codeflash_version, "lsp_mode": is_LSP_enabled(), - "model": model, - "call_sequence": call_sequence, } - if line_profiler_results == "": - logger.info("No LineProfiler results were provided, Skipping optimization.") - return [] try: - response = self.make_ai_service_request("/optimize-line-profiler", payload=payload, timeout=60) + response = self.make_ai_service_request("/optimize-line-profiler", payload=payload, timeout=120) except requests.exceptions.RequestException as e: logger.exception(f"Error generating optimized candidates: {e}") ph("cli-optimize-error-caught", {"error": str(e)}) + console.rule() return [] if response.status_code == 200: optimizations_json = response.json()["optimizations"] - logger.debug(f"Backend returned {len(optimizations_json)} LP optimization(s)") - return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE_LP, model=model) + logger.info(f"!lsp|Received {len(optimizations_json)} line profiler optimization candidates.") + console.rule() + return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE_LP) try: error = response.json()["error"] except Exception: error = response.text logger.error(f"Error generating optimized candidates: {response.status_code} - {error}") ph("cli-optimize-error-response", {"response_status_code": response.status_code, "error": error}) - return [] - - def optimize_python_code_multi_model( - self, - source_code: str, - dependency_code: str, - base_trace_id: str, - model_distribution: list[tuple[str, int]], - executor: concurrent.futures.ThreadPoolExecutor, - experiment_metadata: ExperimentMetadata | None = None, - *, - is_async: bool = False, - sequence_offset: int = 0, - ) -> tuple[list[OptimizedCandidate], int]: - """Generate optimizations using multiple models in parallel.""" - logger.info("Generating optimized candidates…") - console.rule() - - futures: list[tuple[concurrent.futures.Future[list[OptimizedCandidate]], str]] = [] - - call_index = 0 - for model_name, num_calls in model_distribution: - for _ in range(num_calls): - call_trace_id = f"{base_trace_id[:-3]}0{call_index:02x}" - call_sequence = sequence_offset + call_index + 1 - call_index += 1 - future = executor.submit( - self.optimize_python_code, - source_code, - dependency_code, - call_trace_id, - experiment_metadata, - is_async=is_async, - model=model_name, - call_sequence=call_sequence, - ) - futures.append((future, model_name)) - - concurrent.futures.wait([f for f, _ in futures]) - - all_candidates: list[OptimizedCandidate] = [] - for future, model_name in futures: - try: - candidates = future.result() - all_candidates.extend(candidates) - except Exception as e: - logger.warning(f"Model {model_name} call failed: {e}") - continue - console.rule() - return all_candidates, call_index - - def optimize_python_code_line_profiler_multi_model( - self, - source_code: str, - dependency_code: str, - base_trace_id: str, - line_profiler_results: str, - model_distribution: list[tuple[str, int]], - executor: concurrent.futures.ThreadPoolExecutor, - experiment_metadata: ExperimentMetadata | None = 
None, - sequence_offset: int = 0, - ) -> tuple[list[OptimizedCandidate], int]: - """Generate line profiler optimizations using multiple models in parallel.""" - logger.info("Generating optimized candidates with line profiler…") - console.rule() - - futures: list[tuple[concurrent.futures.Future[list[OptimizedCandidate]], str]] = [] - - call_index = 0 - for model_name, num_calls in model_distribution: - for _ in range(num_calls): - call_trace_id = f"{base_trace_id[:-3]}1{call_index:02x}" - call_sequence = sequence_offset + call_index + 1 - call_index += 1 - future = executor.submit( - self.optimize_python_code_line_profiler, - source_code, - dependency_code, - call_trace_id, - line_profiler_results, - experiment_metadata, - model_name, - call_sequence, - ) - futures.append((future, model_name)) - - concurrent.futures.wait([f for f, _ in futures]) - - all_candidates: list[OptimizedCandidate] = [] - for future, model_name in futures: - try: - candidates = future.result() - all_candidates.extend(candidates) - except Exception as e: - logger.warning(f"Line profiler model {model_name} call failed: {e}") - continue - - console.rule() - return all_candidates, call_index + return [] def optimize_python_code_refinement(self, request: list[AIServiceRefinerRequest]) -> list[OptimizedCandidate]: """Optimize the given python code for performance by making a request to the Django endpoint. diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py index ba09989f8..88758455e 100644 --- a/codeflash/code_utils/config_consts.py +++ b/codeflash/code_utils/config_consts.py @@ -32,20 +32,6 @@ MAX_N_CANDIDATES = 5 MAX_N_CANDIDATES_LP = 6 -# Multi-model diversity configuration -# Each tuple is (model_name, num_calls) where each call returns 1 candidate -# Standard mode: 3 GPT-4.1 + 2 Claude Sonnet = 5 candidates -MODEL_DISTRIBUTION: list[tuple[str, int]] = [("gpt-4.1", 3), ("claude-sonnet-4-5", 2)] - -# LSP mode: fewer candidates for faster response -MODEL_DISTRIBUTION_LSP: list[tuple[str, int]] = [("gpt-4.1", 2), ("claude-sonnet-4-5", 1)] - -# Line profiler mode: 6 candidates total -MODEL_DISTRIBUTION_LP: list[tuple[str, int]] = [("gpt-4.1", 4), ("claude-sonnet-4-5", 2)] - -# Line profiler LSP mode -MODEL_DISTRIBUTION_LP_LSP: list[tuple[str, int]] = [("gpt-4.1", 2), ("claude-sonnet-4-5", 1)] - try: from codeflash.lsp.helpers import is_LSP_enabled @@ -57,7 +43,5 @@ N_CANDIDATES_LP_EFFECTIVE = min(N_CANDIDATES_LP_LSP if _IS_LSP_ENABLED else N_CANDIDATES_LP, MAX_N_CANDIDATES_LP) N_TESTS_TO_GENERATE_EFFECTIVE = N_TESTS_TO_GENERATE_LSP if _IS_LSP_ENABLED else N_TESTS_TO_GENERATE TOTAL_LOOPING_TIME_EFFECTIVE = TOTAL_LOOPING_TIME_LSP if _IS_LSP_ENABLED else TOTAL_LOOPING_TIME -MODEL_DISTRIBUTION_EFFECTIVE = MODEL_DISTRIBUTION_LSP if _IS_LSP_ENABLED else MODEL_DISTRIBUTION -MODEL_DISTRIBUTION_LP_EFFECTIVE = MODEL_DISTRIBUTION_LP_LSP if _IS_LSP_ENABLED else MODEL_DISTRIBUTION_LP MAX_CONTEXT_LEN_REVIEW = 1000 diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 6228ee01a..138fe9424 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -46,8 +46,6 @@ COVERAGE_THRESHOLD, INDIVIDUAL_TESTCASE_TIMEOUT, MAX_REPAIRS_PER_TRACE, - MODEL_DISTRIBUTION_EFFECTIVE, - MODEL_DISTRIBUTION_LP_EFFECTIVE, N_TESTS_TO_GENERATE_EFFECTIVE, REFINE_ALL_THRESHOLD, REFINED_CANDIDATE_RANKING_WEIGHTS, @@ -139,7 +137,6 @@ def __init__( ai_service_client: AiServiceClient, executor: 
concurrent.futures.ThreadPoolExecutor, future_all_code_repair: list[concurrent.futures.Future], - sequence_offset: int = 0, ) -> None: self.candidate_queue = queue.Queue() self.line_profiler_done = False @@ -147,8 +144,6 @@ def __init__( self.candidate_len = len(initial_candidates) self.ai_service_client = ai_service_client self.executor = executor - self.sequence_offset = sequence_offset - self.lp_calls_count = 0 self.refinement_calls_count = 0 # Initialize queue with initial candidates @@ -160,7 +155,7 @@ def __init__( self.future_all_code_repair = future_all_code_repair def get_total_llm_calls(self) -> int: - return self.sequence_offset + self.lp_calls_count + self.refinement_calls_count + return self.refinement_calls_count def get_next_candidate(self) -> OptimizedCandidate | None: """Get the next candidate from the queue, handling async results as needed.""" @@ -183,11 +178,7 @@ def _process_line_profiler_results(self) -> OptimizedCandidate | None: """Process line profiler results and add to queue.""" logger.debug("all candidates processed, await candidates from line profiler") concurrent.futures.wait([self.future_line_profile_results]) - result = self.future_line_profile_results.result() - - # LP multi-model now returns (candidates, lp_call_count) - line_profile_results, lp_call_count = result - self.lp_calls_count = lp_call_count + line_profile_results = self.future_line_profile_results.result() for candidate in line_profile_results: self.candidate_queue.put(candidate) @@ -203,18 +194,13 @@ def refine_optimizations(self, request: list[AIServiceRefinerRequest]) -> concur def _process_refinement_results(self) -> OptimizedCandidate | None: """Process refinement results and add to queue. We generate a weighted ranking based on the runtime and diff lines and select the best (round of 45%) of valid optimizations to be refined.""" - import dataclasses - future_refinements: list[concurrent.futures.Future] = [] - # Calculate base sequence: offset + lp_calls (refinements come after LP) - base_sequence = self.sequence_offset + self.lp_calls_count refinement_call_index = 0 if len(self.all_refinements_data) <= REFINE_ALL_THRESHOLD: for data in self.all_refinements_data: refinement_call_index += 1 - data_with_seq = dataclasses.replace(data, call_sequence=base_sequence + refinement_call_index) - future_refinements.append(self.refine_optimizations([data_with_seq])) + future_refinements.append(self.refine_optimizations([data])) else: diff_lens_list = [] runtimes_list = [] @@ -235,8 +221,7 @@ def _process_refinement_results(self) -> OptimizedCandidate | None: for idx in top_indecies: refinement_call_index += 1 data = self.all_refinements_data[idx] - data_with_seq = dataclasses.replace(data, call_sequence=base_sequence + refinement_call_index) - future_refinements.append(self.refine_optimizations([data_with_seq])) + future_refinements.append(self.refine_optimizations([data])) # Track total refinement calls made self.refinement_calls_count = refinement_call_index @@ -342,10 +327,8 @@ def __init__( self.optimization_review = "" self.future_all_code_repair: list[concurrent.futures.Future] = [] self.repair_counter = 0 # track how many repairs we did for each function - self.test_gen_calls_count = 0 - self.optimize_calls_count = 0 - self.lp_calls_count = 0 - self.total_llm_calls = 0 + # Counter for post-optimization LLM calls (explanation, review) - optimization calls are handled by backend + self.post_optimization_call_count = 0 def can_be_optimized(self) -> Result[tuple[bool, CodeOptimizationContext, 
dict[Path, str]], str]: should_run_experiment = self.experiment_id is not None @@ -949,19 +932,16 @@ def determine_best_candidate( assert ai_service_client is not None, "AI service client must be set for optimization" future_line_profile_results = self.executor.submit( - ai_service_client.optimize_python_code_line_profiler_multi_model, + ai_service_client.optimize_python_code_line_profiler, source_code=code_context.read_writable_code.markdown, dependency_code=code_context.read_only_context_code, - base_trace_id=self.get_trace_id(exp_type), + trace_id=self.get_trace_id(exp_type), line_profiler_results=original_code_baseline.line_profile_results["str_out"], - model_distribution=MODEL_DISTRIBUTION_LP_EFFECTIVE, - executor=self.executor, experiment_metadata=ExperimentMetadata( id=self.experiment_id, group="control" if exp_type == "EXP0" else "experiment" ) if self.experiment_id else None, - sequence_offset=self.optimize_calls_count, ) processor = CandidateProcessor( @@ -971,7 +951,6 @@ def determine_best_candidate( self.aiservice_client, self.executor, self.future_all_code_repair, - sequence_offset=self.optimize_calls_count, ) candidate_index = 0 @@ -1005,9 +984,6 @@ def determine_best_candidate( self.function_to_optimize_source_code, original_helper_code, self.function_to_optimize.file_path ) - # Track total LLM calls from the processor for sequence numbering - self.total_llm_calls = processor.get_total_llm_calls() - # Select and return the best optimization best_optimization = self.select_best_optimization( eval_ctx=eval_ctx, @@ -1386,17 +1362,14 @@ def generate_optimizations( read_only_context_code: str, run_experiment: bool = False, # noqa: FBT001, FBT002 ) -> Result[tuple[OptimizationSet, str], str]: - """Generate optimization candidates for the function using multiple models in parallel.""" + """Generate optimization candidates for the function. 
Backend handles multi-model diversity.""" future_optimization_candidates = self.executor.submit( - self.aiservice_client.optimize_python_code_multi_model, + self.aiservice_client.optimize_python_code, read_writable_code.markdown, read_only_context_code, self.function_trace_id[:-4] + "EXP0" if run_experiment else self.function_trace_id, - MODEL_DISTRIBUTION_EFFECTIVE, - self.executor, ExperimentMetadata(id=self.experiment_id, group="control") if run_experiment else None, is_async=self.function_to_optimize.is_async, - sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, ) future_references = self.executor.submit( @@ -1413,34 +1386,29 @@ def generate_optimizations( if run_experiment: future_candidates_exp = self.executor.submit( - self.local_aiservice_client.optimize_python_code_multi_model, + self.local_aiservice_client.optimize_python_code, read_writable_code.markdown, read_only_context_code, self.function_trace_id[:-4] + "EXP1", - MODEL_DISTRIBUTION_EFFECTIVE, - self.executor, ExperimentMetadata(id=self.experiment_id, group="experiment"), is_async=self.function_to_optimize.is_async, - sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, ) futures.append(future_candidates_exp) # Wait for optimization futures to complete concurrent.futures.wait(futures) - # Retrieve results - optimize_python_code_multi_model returns (candidates, call_count) - candidates, optimize_call_count = future_optimization_candidates.result() - # Total sequence count = test gen calls + optimization calls (LP will continue from here) - self.optimize_calls_count = N_TESTS_TO_GENERATE_EFFECTIVE + optimize_call_count - logger.info(f"!lsp|Completed {optimize_call_count} optimization calls, got {len(candidates)} candidates.") + # Retrieve results - optimize_python_code returns list of candidates + candidates = future_optimization_candidates.result() + logger.info(f"!lsp|Received {len(candidates)} optimization candidates.") if not candidates: return Failure(f"/!\\ NO OPTIMIZATIONS GENERATED for {self.function_to_optimize.function_name}") - # Handle experiment results - also returns (candidates, call_count) tuple + # Handle experiment results candidates_experiment = None if future_candidates_exp: - candidates_experiment, _ = future_candidates_exp.result() + candidates_experiment = future_candidates_exp.result() function_references = future_references.result() return Success((OptimizationSet(control=candidates, experiment=candidates_experiment), function_references)) @@ -1687,9 +1655,9 @@ def process_review( ) throughput_improvement_str = f"{throughput_improvement_value * 100:.1f}%" - # Explanation call continues the sequence numbering - explanation_call_sequence = self.total_llm_calls + 1 - self.total_llm_calls = explanation_call_sequence + # Explanation call sequence for tracking + self.post_optimization_call_count += 1 + explanation_call_sequence = self.post_optimization_call_count new_explanation_raw_str = self.aiservice_client.get_new_explanation( source_code=code_context.read_writable_code.flat, @@ -1744,9 +1712,9 @@ def process_review( staging_review = self.args.staging_review opt_review_response = "" # this will now run regardless of pr, staging review flags - # Optimization review call continues the sequence numbering - review_call_sequence = self.total_llm_calls + 1 - self.total_llm_calls = review_call_sequence + # Review call sequence for tracking + self.post_optimization_call_count += 1 + review_call_sequence = self.post_optimization_call_count try: opt_review_response = self.aiservice_client.get_optimization_review( @@ -2241,9 
+2209,6 @@ def submit_test_generation_tasks( generated_test_paths: list[Path], generated_perf_test_paths: list[Path], ) -> list[concurrent.futures.Future]: - # Track how many test generation calls we're making for sequence numbering - self.test_gen_calls_count = len(generated_test_paths) - return [ executor.submit( generate_tests, From 04ff30a0e7e3e88cd8cce6817ca4a73ad96ef8a1 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Fri, 26 Dec 2025 16:06:35 -0500 Subject: [PATCH 08/11] fix call sequence --- codeflash/api/aiservice.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index e858726b1..a65e0e947 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -151,6 +151,7 @@ def optimize_python_code( # noqa: D417 "repo_name": git_repo_name, "is_async": is_async, "lsp_mode": is_LSP_enabled(), + "call_sequence": 1, } logger.debug(f"Sending optimize request: trace_id={trace_id}, lsp_mode={payload['lsp_mode']}") @@ -217,6 +218,7 @@ def optimize_python_code_line_profiler( # noqa: D417 "experiment_metadata": experiment_metadata, "codeflash_version": codeflash_version, "lsp_mode": is_LSP_enabled(), + "call_sequence": 1, } try: From 9c03391c5d385cc3af8eb1c05e43bd5a8509f51d Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Fri, 26 Dec 2025 17:20:42 -0500 Subject: [PATCH 09/11] count properly --- codeflash/api/aiservice.py | 22 +++++++++++--------- codeflash/optimization/function_optimizer.py | 14 +------------ codeflash/verification/verifier.py | 2 -- 3 files changed, 13 insertions(+), 25 deletions(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index a65e0e947..df279fd6f 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -4,6 +4,7 @@ import os import platform import time +from itertools import count from typing import TYPE_CHECKING, Any, cast import requests @@ -39,6 +40,11 @@ class AiServiceClient: def __init__(self) -> None: self.base_url = self.get_aiservice_base_url() self.headers = {"Authorization": f"Bearer {get_codeflash_api_key()}", "Connection": "close"} + self.llm_call_counter = count(1) + + def get_next_sequence(self) -> int: + """Get the next LLM call sequence number.""" + return next(self.llm_call_counter) def get_aiservice_base_url(self) -> str: if os.environ.get("CODEFLASH_AIS_SERVER", default="prod").lower() == "local": @@ -151,7 +157,7 @@ def optimize_python_code( # noqa: D417 "repo_name": git_repo_name, "is_async": is_async, "lsp_mode": is_LSP_enabled(), - "call_sequence": 1, + "call_sequence": self.get_next_sequence(), } logger.debug(f"Sending optimize request: trace_id={trace_id}, lsp_mode={payload['lsp_mode']}") @@ -218,7 +224,7 @@ def optimize_python_code_line_profiler( # noqa: D417 "experiment_metadata": experiment_metadata, "codeflash_version": codeflash_version, "lsp_mode": is_LSP_enabled(), - "call_sequence": 1, + "call_sequence": self.get_next_sequence(), } try: @@ -269,7 +275,7 @@ def optimize_python_code_refinement(self, request: list[AIServiceRefinerRequest] "trace_id": opt.trace_id, "function_references": opt.function_references, "python_version": platform.python_version(), - "call_sequence": opt.call_sequence, + "call_sequence": self.get_next_sequence(), } for opt in request ] @@ -359,7 +365,6 @@ def get_new_explanation( # noqa: D417 throughput_improvement: str | None = None, function_references: str | None = None, codeflash_version: str = codeflash_version, - call_sequence: int | None = None, ) -> str: """Optimize the given python code for 
performance by making a request to the Django endpoint. @@ -405,7 +410,7 @@ def get_new_explanation( # noqa: D417 "throughput_improvement": throughput_improvement, "function_references": function_references, "codeflash_version": codeflash_version, - "call_sequence": call_sequence, + "call_sequence": self.get_next_sequence(), } logger.info("loading|Generating explanation") console.rule() @@ -533,7 +538,6 @@ def generate_regression_tests( # noqa: D417 test_timeout: int, trace_id: str, test_index: int, - call_sequence: int | None = None, ) -> tuple[str, str, str] | None: """Generate regression tests for the given function by making a request to the Django endpoint. @@ -569,7 +573,7 @@ def generate_regression_tests( # noqa: D417 "python_version": platform.python_version(), "codeflash_version": codeflash_version, "is_async": function_to_optimize.is_async, - "call_sequence": call_sequence, + "call_sequence": self.get_next_sequence(), } try: response = self.make_ai_service_request("/testgen", payload=payload, timeout=90) @@ -610,7 +614,6 @@ def get_optimization_review( replay_tests: str, concolic_tests: str, # noqa: ARG002 calling_fn_details: str, - call_sequence: int | None = None, ) -> str: """Compute the optimization review of current Pull Request. @@ -626,7 +629,6 @@ def get_optimization_review( root_dir: Path -> path of git directory concolic_tests: str -> concolic_tests (not used) calling_fn_details: str -> filenames and definitions of functions which call the function_to_optimize - call_sequence: int | None -> sequence number for multi-model calls Returns: ------- @@ -658,7 +660,7 @@ def get_optimization_review( "codeflash_version": codeflash_version, "calling_fn_details": calling_fn_details, "python_version": platform.python_version(), - "call_sequence": call_sequence, + "call_sequence": self.get_next_sequence(), } console.rule() try: diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 138fe9424..45adf8d44 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -327,8 +327,6 @@ def __init__( self.optimization_review = "" self.future_all_code_repair: list[concurrent.futures.Future] = [] self.repair_counter = 0 # track how many repairs we did for each function - # Counter for post-optimization LLM calls (explanation, review) - optimization calls are handled by backend - self.post_optimization_call_count = 0 def can_be_optimized(self) -> Result[tuple[bool, CodeOptimizationContext, dict[Path, str]], str]: should_run_experiment = self.experiment_id is not None @@ -1655,10 +1653,6 @@ def process_review( ) throughput_improvement_str = f"{throughput_improvement_value * 100:.1f}%" - # Explanation call sequence for tracking - self.post_optimization_call_count += 1 - explanation_call_sequence = self.post_optimization_call_count - new_explanation_raw_str = self.aiservice_client.get_new_explanation( source_code=code_context.read_writable_code.flat, dependency_code=code_context.read_only_context_code, @@ -1676,7 +1670,6 @@ def process_review( optimized_throughput=optimized_throughput_str, throughput_improvement=throughput_improvement_str, function_references=function_references, - call_sequence=explanation_call_sequence, ) new_explanation = Explanation( raw_explanation_message=new_explanation_raw_str or explanation.raw_explanation_message, @@ -1712,13 +1705,9 @@ def process_review( staging_review = self.args.staging_review opt_review_response = "" # this will now run regardless of pr, 
staging review flags - # Review call sequence for tracking - self.post_optimization_call_count += 1 - review_call_sequence = self.post_optimization_call_count - try: opt_review_response = self.aiservice_client.get_optimization_review( - **data, calling_fn_details=function_references, call_sequence=review_call_sequence + **data, calling_fn_details=function_references ) except Exception as e: logger.debug(f"optimization review response failed, investigate {e}") @@ -2223,7 +2212,6 @@ def submit_test_generation_tasks( test_index, test_path, test_perf_path, - call_sequence=test_index + 1, ) for test_index, (test_path, test_perf_path) in enumerate( zip(generated_test_paths, generated_perf_test_paths) diff --git a/codeflash/verification/verifier.py b/codeflash/verification/verifier.py index d94455df3..8d187f2b1 100644 --- a/codeflash/verification/verifier.py +++ b/codeflash/verification/verifier.py @@ -27,7 +27,6 @@ def generate_tests( test_index: int, test_path: Path, test_perf_path: Path, - call_sequence: int | None = None, ) -> tuple[str, str, Path] | None: # TODO: Sometimes this recreates the original Class definition. This overrides and messes up the original # class import. Remove the recreation of the class definition @@ -43,7 +42,6 @@ def generate_tests( test_timeout=test_timeout, trace_id=function_trace_id, test_index=test_index, - call_sequence=call_sequence, ) if response and isinstance(response, tuple) and len(response) == 3: generated_test_source, instrumented_behavior_test_source, instrumented_perf_test_source = response From 2237ad6b893070ed480cafb5a9a5f08658b3090c Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 30 Dec 2025 03:49:36 -0500 Subject: [PATCH 10/11] cleanup UI --- codeflash/discovery/discover_unit_tests.py | 1 + codeflash/optimization/function_optimizer.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/codeflash/discovery/discover_unit_tests.py b/codeflash/discovery/discover_unit_tests.py index bc0e2fd67..587b972ee 100644 --- a/codeflash/discovery/discover_unit_tests.py +++ b/codeflash/discovery/discover_unit_tests.py @@ -751,6 +751,7 @@ def process_test_files( tests_cache = TestsCache(project_root_path) logger.info("!lsp|Discovering tests and processing unit tests") + console.rule() with test_files_progress_bar(total=len(file_to_test_map), description="Processing test files") as ( progress, task_id, diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 45adf8d44..2a65b57ba 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -245,6 +245,7 @@ def _process_refinement_results(self) -> OptimizedCandidate | None: logger.info( f"Added {len(refinement_response)} candidates from refinement, total candidates now: {self.candidate_len}" ) + console.rule() self.refinement_done = True return self.get_next_candidate() @@ -1213,7 +1214,6 @@ def instrument_existing_tests(self, function_to_all_tests: dict[str, set[Functio func_qualname = self.function_to_optimize.qualified_name_with_modules_from_root(self.project_root) if func_qualname not in function_to_all_tests: logger.info(f"Did not find any pre-existing tests for '{func_qualname}', will only use generated tests.") - console.rule() else: test_file_invocation_positions = defaultdict(list) for tests_in_file in function_to_all_tests.get(func_qualname): @@ -1349,7 +1349,8 @@ def generate_tests( if concolic_test_str: count_tests += 1 - logger.info(f"!lsp|Generated '{count_tests}' tests 
for '{self.function_to_optimize.function_name}'") + logger.info(f"!lsp|Generated {count_tests} tests for '{self.function_to_optimize.function_name}'") + console.rule() generated_tests = GeneratedTestsList(generated_tests=tests) return Success((count_tests, generated_tests, function_to_concolic_tests, concolic_test_str)) @@ -1398,7 +1399,6 @@ def generate_optimizations( # Retrieve results - optimize_python_code returns list of candidates candidates = future_optimization_candidates.result() - logger.info(f"!lsp|Received {len(candidates)} optimization candidates.") if not candidates: return Failure(f"/!\\ NO OPTIMIZATIONS GENERATED for {self.function_to_optimize.function_name}") @@ -2026,6 +2026,7 @@ def run_optimized_candidate( return self.get_results_not_matched_error() logger.info(f"loading|Running performance tests for candidate {optimization_candidate_index}...") + console.rule() # For async functions, instrument at definition site for performance benchmarking if self.function_to_optimize.is_async: From 1be8302255da4875b5d177d1e57706fad3cf6175 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Wed, 31 Dec 2025 03:12:35 -0500 Subject: [PATCH 11/11] revert timeouts --- codeflash/api/aiservice.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index df279fd6f..c18495899 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -162,7 +162,7 @@ def optimize_python_code( # noqa: D417 logger.debug(f"Sending optimize request: trace_id={trace_id}, lsp_mode={payload['lsp_mode']}") try: - response = self.make_ai_service_request("/optimize", payload=payload, timeout=120) + response = self.make_ai_service_request("/optimize", payload=payload, timeout=60) except requests.exceptions.RequestException as e: logger.exception(f"Error generating optimized candidates: {e}") ph("cli-optimize-error-caught", {"error": str(e)}) @@ -228,7 +228,7 @@ def optimize_python_code_line_profiler( # noqa: D417 } try: - response = self.make_ai_service_request("/optimize-line-profiler", payload=payload, timeout=120) + response = self.make_ai_service_request("/optimize-line-profiler", payload=payload, timeout=60) except requests.exceptions.RequestException as e: logger.exception(f"Error generating optimized candidates: {e}") ph("cli-optimize-error-caught", {"error": str(e)})