2 changes: 2 additions & 0 deletions codeflash/api/aiservice.py
@@ -191,6 +191,7 @@ def optimize_python_code_line_profiler( # noqa: D417
dependency_code: str,
trace_id: str,
line_profiler_results: str,
num_candidates: int = 8,
experiment_metadata: ExperimentMetadata | None = None,
) -> list[OptimizedCandidate]:
"""Optimize the given python code for performance using line profiler results.
@@ -218,6 +219,7 @@ def optimize_python_code_line_profiler( # noqa: D417
payload = {
"source_code": source_code,
"dependency_code": dependency_code,
"n_candidates_lp": num_candidates,
"line_profiler_results": line_profiler_results,
"trace_id": trace_id,
"python_version": platform.python_version(),
3 changes: 3 additions & 0 deletions codeflash/cli_cmds/cli.py
@@ -107,6 +107,9 @@ def parse_args() -> Namespace:
action="store_true",
help="(Deprecated) Async function optimization is now enabled by default. This flag is ignored.",
)
parser.add_argument(
"--effort", type=str, help="Effort level for optimization", choices=["low", "medium", "high"], default="medium"
)

args, unknown_args = parser.parse_known_args()
sys.argv[:] = [sys.argv[0], *unknown_args]
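For context (not part of the diff): a minimal sketch of how the new `--effort` flag could flow into the effort machinery added in `config_consts.py`. The standalone parser below is illustrative; only the `add_argument` call and the `EffortLevel` enum come from this PR.

```python
from argparse import ArgumentParser

from codeflash.code_utils.config_consts import EffortLevel

# Illustrative parser mirroring the --effort argument added above.
parser = ArgumentParser()
parser.add_argument(
    "--effort", type=str, help="Effort level for optimization", choices=["low", "medium", "high"], default="medium"
)

args = parser.parse_args(["--effort", "low"])

# EffortLevel is a StrEnum whose auto() values are the lowercase member names,
# so the CLI string maps directly onto the enum.
assert EffortLevel(args.effort) is EffortLevel.LOW
```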
65 changes: 47 additions & 18 deletions codeflash/code_utils/config_consts.py
@@ -1,18 +1,17 @@
from enum import StrEnum, auto
from typing import Any

MAX_TEST_RUN_ITERATIONS = 5
INDIVIDUAL_TESTCASE_TIMEOUT = 15
MAX_FUNCTION_TEST_SECONDS = 60
N_CANDIDATES = 5
MIN_IMPROVEMENT_THRESHOLD = 0.05
MIN_THROUGHPUT_IMPROVEMENT_THRESHOLD = 0.10 # 10% minimum improvement for async throughput
MAX_TEST_FUNCTION_RUNS = 50
MAX_CUMULATIVE_TEST_RUNTIME_NANOSECONDS = 100e6 # 100ms
N_TESTS_TO_GENERATE = 2
TOTAL_LOOPING_TIME = 10.0 # 10 second candidate benchmarking budget
COVERAGE_THRESHOLD = 60.0
MIN_TESTCASE_PASSED_THRESHOLD = 6
REPEAT_OPTIMIZATION_PROBABILITY = 0.1
DEFAULT_IMPORTANCE_THRESHOLD = 0.001
N_CANDIDATES_LP = 6

# pytest loop stability
# For now, we use strict thresholds (large windows and low tolerances), since this is still experimental.
@@ -21,22 +20,10 @@
STABILITY_SPREAD_TOLERANCE = 0.0025 # 0.25% window spread

# Refinement
REFINE_ALL_THRESHOLD = 2 # when valid optimizations count is 2 or less, refine all optimizations
REFINED_CANDIDATE_RANKING_WEIGHTS = (2, 1) # (runtime, diff), runtime is more important than diff by a factor of 2
TOP_N_REFINEMENTS = 0.45 # top 45% of valid optimizations (based on the weighted score) are refined

# LSP-specific
N_CANDIDATES_LSP = 3
N_TESTS_TO_GENERATE_LSP = 2
TOTAL_LOOPING_TIME_LSP = 10.0  # Kept same timing for LSP mode to avoid an increase in performance reporting
N_CANDIDATES_LP_LSP = 3

# Code repair
REPAIR_UNMATCHED_PERCENTAGE_LIMIT = 0.4  # if the percentage of unmatched tests is greater than this, we won't fix it (lowering this value makes the repair stricter)
MAX_REPAIRS_PER_TRACE = 4 # maximum number of repairs we will do for each function

MAX_N_CANDIDATES = 5
MAX_N_CANDIDATES_LP = 6

try:
from codeflash.lsp.helpers import is_LSP_enabled
@@ -45,9 +32,51 @@
except ImportError:
_IS_LSP_ENABLED = False

N_CANDIDATES_EFFECTIVE = min(N_CANDIDATES_LSP if _IS_LSP_ENABLED else N_CANDIDATES, MAX_N_CANDIDATES)
N_CANDIDATES_LP_EFFECTIVE = min(N_CANDIDATES_LP_LSP if _IS_LSP_ENABLED else N_CANDIDATES_LP, MAX_N_CANDIDATES_LP)
N_TESTS_TO_GENERATE_EFFECTIVE = N_TESTS_TO_GENERATE_LSP if _IS_LSP_ENABLED else N_TESTS_TO_GENERATE
TOTAL_LOOPING_TIME_EFFECTIVE = TOTAL_LOOPING_TIME_LSP if _IS_LSP_ENABLED else TOTAL_LOOPING_TIME

MAX_CONTEXT_LEN_REVIEW = 1000


class EffortLevel(StrEnum):
LOW = auto()
MEDIUM = auto()
HIGH = auto()


class EffortKeys(StrEnum):
N_OPTIMIZER_CANDIDATES = auto()
N_OPTIMIZER_LP_CANDIDATES = auto()
N_GENERATED_TESTS = auto()
MAX_CODE_REPAIRS_PER_TRACE = auto()
REPAIR_UNMATCHED_PERCENTAGE_LIMIT = auto()
TOP_VALID_CANDIDATES_FOR_REFINEMENT = auto()


EFFORT_VALUES: dict[str, dict[EffortLevel, Any]] = {
EffortKeys.N_OPTIMIZER_CANDIDATES.value: {EffortLevel.LOW: 3, EffortLevel.MEDIUM: 4, EffortLevel.HIGH: 5},
EffortKeys.N_OPTIMIZER_LP_CANDIDATES.value: {EffortLevel.LOW: 3, EffortLevel.MEDIUM: 5, EffortLevel.HIGH: 6},
# we don't use effort with generated tests for now
EffortKeys.N_GENERATED_TESTS.value: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 2, EffortLevel.HIGH: 2},
# maximum number of repairs we will do for each function
EffortKeys.MAX_CODE_REPAIRS_PER_TRACE.value: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 4, EffortLevel.HIGH: 5},
    # if the percentage of unmatched tests is greater than this, we won't fix it (lowering this value makes the repair stricter)
    # at the low effort level we lower the limit to 20% to be stricter (fewer repairs, less time)
EffortKeys.REPAIR_UNMATCHED_PERCENTAGE_LIMIT.value: {
EffortLevel.LOW: 0.2,
EffortLevel.MEDIUM: 0.4,
EffortLevel.HIGH: 0.5,
},
    # Number of top valid candidates selected for refinement
    EffortKeys.TOP_VALID_CANDIDATES_FOR_REFINEMENT.value: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 3, EffortLevel.HIGH: 4},
}


def get_effort_value(key: EffortKeys, effort: EffortLevel) -> Any:
key_str = key.value
if key_str in EFFORT_VALUES:
if effort in EFFORT_VALUES[key_str]:
return EFFORT_VALUES[key_str][effort]
msg = f"Invalid effort level: {effort}"
raise ValueError(msg)
msg = f"Invalid key: {key_str}"
raise ValueError(msg)
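For orientation, a short sketch of how the new effort table is meant to be queried. The calls below are illustrative; `EffortKeys`, `EffortLevel`, and `get_effort_value` are the names defined above, and the expected values are read off the `EFFORT_VALUES` table.

```python
from codeflash.code_utils.config_consts import EffortKeys, EffortLevel, get_effort_value

# Candidate counts scale with effort (values taken from EFFORT_VALUES above).
assert get_effort_value(EffortKeys.N_OPTIMIZER_CANDIDATES, EffortLevel.LOW) == 3
assert get_effort_value(EffortKeys.N_OPTIMIZER_CANDIDATES, EffortLevel.HIGH) == 5

# Because EffortLevel is a StrEnum, the raw CLI string (e.g. args.effort) also works as a lookup key.
assert get_effort_value(EffortKeys.MAX_CODE_REPAIRS_PER_TRACE, "medium") == 4

# Unknown effort levels raise ValueError rather than silently falling back.
try:
    get_effort_value(EffortKeys.N_GENERATED_TESTS, "extreme")
except ValueError as err:
    print(err)  # -> Invalid effort level: extreme
```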
34 changes: 0 additions & 34 deletions codeflash/code_utils/git_utils.py
@@ -1,10 +1,7 @@
from __future__ import annotations

import os
import shutil
import subprocess
import sys
import tempfile
import time
from functools import cache
from io import StringIO
@@ -16,7 +13,6 @@
from unidiff import PatchSet

from codeflash.cli_cmds.console import logger
from codeflash.code_utils.config_consts import N_CANDIDATES_EFFECTIVE

if TYPE_CHECKING:
from git import Repo
@@ -195,36 +191,6 @@ def check_and_push_branch(repo: git.Repo, git_remote: str | None = "origin", *,
return True


def create_worktree_root_dir(module_root: Path) -> tuple[Path | None, Path | None]:
git_root = git_root_dir() if check_running_in_git_repo(module_root) else None
worktree_root_dir = Path(tempfile.mkdtemp()) if git_root else None
return git_root, worktree_root_dir


def create_git_worktrees(
git_root: Path | None, worktree_root_dir: Path | None, module_root: Path
) -> tuple[Path | None, list[Path]]:
if git_root and worktree_root_dir:
worktree_root = Path(tempfile.mkdtemp(dir=worktree_root_dir))
worktrees = [Path(tempfile.mkdtemp(dir=worktree_root)) for _ in range(N_CANDIDATES_EFFECTIVE + 1)]
for worktree in worktrees:
subprocess.run(["git", "worktree", "add", "-d", worktree], cwd=module_root, check=True)
else:
worktree_root = None
worktrees = []
return worktree_root, worktrees


def remove_git_worktrees(worktree_root: Path | None, worktrees: list[Path]) -> None:
try:
for worktree in worktrees:
subprocess.run(["git", "worktree", "remove", "-f", worktree], check=True)
except subprocess.CalledProcessError as e:
logger.warning(f"Error removing worktrees: {e}")
if worktree_root:
shutil.rmtree(worktree_root)


def get_last_commit_author_if_pr_exists(repo: Repo | None = None) -> str | None:
"""Return the author's name of the last commit in the current branch if PR_NUMBER is set.

2 changes: 2 additions & 0 deletions codeflash/lsp/server.py
@@ -7,6 +7,7 @@
from pygls.lsp.server import LanguageServer
from pygls.protocol import LanguageServerProtocol

from codeflash.code_utils.config_consts import EffortLevel
from codeflash.either import Result
from codeflash.models.models import CodeOptimizationContext

@@ -37,6 +38,7 @@ def prepare_optimizer_arguments(self, config_file: Path) -> None:
args.config_file = config_file
args.no_pr = True # LSP server should not create PRs
args.worktree = True
args.effort = EffortLevel.LOW.value # low effort for high speed
self.args = args
# avoid initializing the optimizer during initialization, because it can cause an error if the api key is invalid

35 changes: 22 additions & 13 deletions codeflash/optimization/function_optimizer.py
@@ -45,14 +45,11 @@
from codeflash.code_utils.config_consts import (
COVERAGE_THRESHOLD,
INDIVIDUAL_TESTCASE_TIMEOUT,
MAX_REPAIRS_PER_TRACE,
N_TESTS_TO_GENERATE_EFFECTIVE,
REFINE_ALL_THRESHOLD,
REFINED_CANDIDATE_RANKING_WEIGHTS,
REPAIR_UNMATCHED_PERCENTAGE_LIMIT,
REPEAT_OPTIMIZATION_PROBABILITY,
TOP_N_REFINEMENTS,
TOTAL_LOOPING_TIME_EFFECTIVE,
EffortKeys,
get_effort_value,
)
from codeflash.code_utils.deduplicate_code import normalize_code
from codeflash.code_utils.edit_generated_tests import (
@@ -137,13 +134,15 @@ def __init__(
ai_service_client: AiServiceClient,
executor: concurrent.futures.ThreadPoolExecutor,
future_all_code_repair: list[concurrent.futures.Future],
effort: str,
) -> None:
self.candidate_queue = queue.Queue()
self.line_profiler_done = False
self.refinement_done = False
self.candidate_len = len(initial_candidates)
self.ai_service_client = ai_service_client
self.executor = executor
self.effort = effort
self.refinement_calls_count = 0

# Initialize queue with initial candidates
@@ -195,9 +194,16 @@ def refine_optimizations(self, request: list[AIServiceRefinerRequest]) -> concur
def _process_refinement_results(self) -> OptimizedCandidate | None:
"""Process refinement results and add to queue. We generate a weighted ranking based on the runtime and diff lines and select the best (round of 45%) of valid optimizations to be refined."""
future_refinements: list[concurrent.futures.Future] = []
        top_n_candidates = min(
            int(get_effort_value(EffortKeys.TOP_VALID_CANDIDATES_FOR_REFINEMENT, self.effort)),
            len(self.all_refinements_data),
        )
refinement_call_index = 0

if len(self.all_refinements_data) <= REFINE_ALL_THRESHOLD:
if top_n_candidates == len(self.all_refinements_data):
# if we'll refine all candidates, we can skip the ranking and just refine them all
for data in self.all_refinements_data:
refinement_call_index += 1
future_refinements.append(self.refine_optimizations([data]))
@@ -215,7 +221,6 @@ def _process_refinement_results(self) -> OptimizedCandidate | None:
diffs_norm = normalize_by_max(diff_lens_list)
# the lower the better
score_dict = create_score_dictionary_from_metrics(weights, runtime_norm, diffs_norm)
top_n_candidates = int((TOP_N_REFINEMENTS * len(runtimes_list)) + 0.5)
top_indecies = sorted(score_dict, key=score_dict.get)[:top_n_candidates]

for idx in top_indecies:
@@ -321,7 +326,7 @@ def __init__(
self.function_benchmark_timings = function_benchmark_timings if function_benchmark_timings else {}
self.total_benchmark_timings = total_benchmark_timings if total_benchmark_timings else {}
self.replay_tests_dir = replay_tests_dir if replay_tests_dir else None
n_tests = N_TESTS_TO_GENERATE_EFFECTIVE
n_tests = get_effort_value(EffortKeys.N_GENERATED_TESTS, args.effort)
self.executor = concurrent.futures.ThreadPoolExecutor(
max_workers=n_tests + 3 if self.experiment_id is None else n_tests + 4
)
@@ -371,7 +376,7 @@ def generate_and_instrument_tests(
str,
]:
"""Generate and instrument tests for the function."""
n_tests = N_TESTS_TO_GENERATE_EFFECTIVE
n_tests = get_effort_value(EffortKeys.N_GENERATED_TESTS, self.args.effort)
generated_test_paths = [
get_test_file_path(
self.test_cfg.tests_root, self.function_to_optimize.function_name, test_index, test_type="unit"
@@ -936,6 +941,7 @@ def determine_best_candidate(
dependency_code=code_context.read_only_context_code,
trace_id=self.get_trace_id(exp_type),
line_profiler_results=original_code_baseline.line_profile_results["str_out"],
num_candidates=get_effort_value(EffortKeys.N_OPTIMIZER_LP_CANDIDATES, self.args.effort),
experiment_metadata=ExperimentMetadata(
id=self.experiment_id, group="control" if exp_type == "EXP0" else "experiment"
)
Expand All @@ -950,6 +956,7 @@ def determine_best_candidate(
self.aiservice_client,
self.executor,
self.future_all_code_repair,
self.args.effort,
)
candidate_index = 0

@@ -1299,7 +1306,7 @@ def generate_tests(
generated_perf_test_paths: list[Path],
) -> Result[tuple[int, GeneratedTestsList, dict[str, set[FunctionCalledInTest]], str], str]:
"""Generate unit tests and concolic tests for the function."""
n_tests = N_TESTS_TO_GENERATE_EFFECTIVE
n_tests = get_effort_value(EffortKeys.N_GENERATED_TESTS, self.args.effort)
assert len(generated_test_paths) == n_tests

if not self.args.no_gen_tests:
@@ -1366,6 +1373,7 @@ def generate_optimizations(
run_experiment: bool = False, # noqa: FBT001, FBT002
) -> Result[tuple[OptimizationSet, str], str]:
"""Generate optimization candidates for the function. Backend handles multi-model diversity."""
# n_candidates = get_effort_value(EffortKeys.N_OPTIMIZER_CANDIDATES, self.args.effort)
future_optimization_candidates = self.executor.submit(
self.aiservice_client.optimize_python_code,
read_writable_code.markdown,
@@ -1930,8 +1938,9 @@ def repair_if_possible(
test_results_count: int,
exp_type: str,
) -> None:
if self.repair_counter >= MAX_REPAIRS_PER_TRACE:
logger.debug(f"Repair counter reached {MAX_REPAIRS_PER_TRACE}, skipping repair")
max_repairs = get_effort_value(EffortKeys.MAX_CODE_REPAIRS_PER_TRACE, self.args.effort)
if self.repair_counter >= max_repairs:
logger.debug(f"Repair counter reached {max_repairs}, skipping repair")
return
if candidate.source not in (OptimizedCandidateSource.OPTIMIZE, OptimizedCandidateSource.OPTIMIZE_LP):
# only repair the first pass of the candidates for now
@@ -1941,7 +1950,7 @@
logger.debug("No diffs found, skipping repair")
return
result_unmatched_perc = len(diffs) / test_results_count
if result_unmatched_perc > REPAIR_UNMATCHED_PERCENTAGE_LIMIT:
if result_unmatched_perc > get_effort_value(EffortKeys.REPAIR_UNMATCHED_PERCENTAGE_LIMIT, self.args.effort):
logger.debug(f"Result unmatched percentage is {result_unmatched_perc * 100}%, skipping repair")
return

2 changes: 2 additions & 0 deletions codeflash/tracer.py
@@ -24,6 +24,7 @@
from codeflash.cli_cmds.console import console
from codeflash.code_utils.code_utils import get_run_tmp_file
from codeflash.code_utils.compat import SAFE_SYS_EXECUTABLE
from codeflash.code_utils.config_consts import EffortLevel
from codeflash.code_utils.config_parser import parse_config_file
from codeflash.tracing.pytest_parallelization import pytest_split

@@ -214,6 +215,7 @@ def main(args: Namespace | None = None) -> ArgumentParser:

from codeflash.optimization import optimizer

args.effort = EffortLevel.HIGH.value
optimizer.run_with_args(args)

# Delete the trace file and the replay test file if they exist