Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
name: CI - pre-commit & run tests

# Run the lint and test job for every pull request that targets main.
on:
  pull_request:
    branches: [main]

# The job only reads the repository, so keep the GITHUB_TOKEN least-privileged.
permissions:
  contents: read

jobs:
  lint-and-test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is always read as a string, never a number.
          python-version: '3.11.7'

      # NOTE(review): the steps below invoke `uv`; presumably setup.sh
      # installs it along with the project dependencies — confirm.
      - name: Install dependencies
        run: ./setup.sh

      - name: Run pre-commit
        run: uv run pre-commit run --all-files

      - name: Run pytest
        run: uv run pytest
42 changes: 42 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# pre-commit hooks: generic file hygiene, Ruff lint/format, secret scanning
# against a committed baseline, and a locally installed pyright type check.
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0  # Use the latest stable version
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-symlinks
      - id: check-added-large-files
      - id: check-case-conflict
      - id: check-json

  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version.
    rev: v0.11.2
    hooks:
      # Run the linter.
      - id: ruff
        args: [--fix]
      # Run the formatter.
      - id: ruff-format
      # Same hooks again, restricted to the manual stage so they can be
      # invoked on demand with `pre-commit run --hook-stage manual`.
      - id: ruff
        args: [--fix]
        stages: [manual]
      - id: ruff-format
        stages: [manual]

  - repo: https://github.com/Yelp/detect-secrets
    rev: v1.5.0
    hooks:
      - id: detect-secrets
        # The hook entry point takes only options plus the file names that
        # pre-commit appends; `audit` is a subcommand of the separate
        # `detect-secrets` CLI and would be treated here as a file to scan.
        args: ['--baseline', '.secrets.baseline']

  - repo: local
    hooks:
      - id: pyright
        name: pyright
        entry: pyright
        # `system` means pyright must already be installed on PATH.
        language: system
        types: [python]
        # NOTE(review): pre-commit also appends the matched file names, which
        # presumably override the file selection in the project config —
        # confirm whether `pass_filenames: false` is wanted here.
        args: [--stats, -p, pyrightconfig.ci.json]
137 changes: 137 additions & 0 deletions .secrets.baseline
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
{
"version": "1.5.0",
"plugins_used": [
{
"name": "ArtifactoryDetector"
},
{
"name": "AWSKeyDetector"
},
{
"name": "AzureStorageKeyDetector"
},
{
"name": "Base64HighEntropyString",
"limit": 4.5
},
{
"name": "BasicAuthDetector"
},
{
"name": "CloudantDetector"
},
{
"name": "DiscordBotTokenDetector"
},
{
"name": "GitHubTokenDetector"
},
{
"name": "GitLabTokenDetector"
},
{
"name": "HexHighEntropyString",
"limit": 3.0
},
{
"name": "IbmCloudIamDetector"
},
{
"name": "IbmCosHmacDetector"
},
{
"name": "IPPublicDetector"
},
{
"name": "JwtTokenDetector"
},
{
"name": "KeywordDetector",
"keyword_exclude": ""
},
{
"name": "MailchimpDetector"
},
{
"name": "NpmDetector"
},
{
"name": "OpenAIDetector"
},
{
"name": "PrivateKeyDetector"
},
{
"name": "PypiTokenDetector"
},
{
"name": "SendGridDetector"
},
{
"name": "SlackDetector"
},
{
"name": "SoftlayerDetector"
},
{
"name": "SquareOAuthDetector"
},
{
"name": "StripeDetector"
},
{
"name": "TelegramBotTokenDetector"
},
{
"name": "TwilioKeyDetector"
}
],
"filters_used": [
{
"path": "detect_secrets.filters.allowlist.is_line_allowlisted"
},
{
"path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies",
"min_level": 2
},
{
"path": "detect_secrets.filters.heuristic.is_indirect_reference"
},
{
"path": "detect_secrets.filters.heuristic.is_likely_id_string"
},
{
"path": "detect_secrets.filters.heuristic.is_lock_file"
},
{
"path": "detect_secrets.filters.heuristic.is_not_alphanumeric_string"
},
{
"path": "detect_secrets.filters.heuristic.is_potential_uuid"
},
{
"path": "detect_secrets.filters.heuristic.is_prefixed_with_dollar_sign"
},
{
"path": "detect_secrets.filters.heuristic.is_sequential_string"
},
{
"path": "detect_secrets.filters.heuristic.is_swagger_file"
},
{
"path": "detect_secrets.filters.heuristic.is_templated_secret"
}
],
"results": {
"setup.sh": [
{
"type": "Hex High Entropy String",
"filename": "setup.sh",
"hashed_secret": "7431e16af96558909e41438950a6ffe7ee811465",
"is_verified": false,
"line_number": 31
}
]
},
"generated_at": "2025-03-31T20:57:53Z"
}
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,4 @@ licensed under the MIT License. The original code has been modified.
Original copyright:
© 2024 Anthropic, PBC

Original license: https://github.com/modelcontextprotocol/servers/blob/main/LICENSE
Original license: https://github.com/modelcontextprotocol/servers/blob/main/LICENSE
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ You can increase `--num-examples` and `--num-candidate-solutions` to run on more

### Running on more examples.

There are 500 examples total in SWE-bench Verified. Note that this can take awhile, so there are a few levels of parallelism this repository supports.
There are 500 examples total in SWE-bench Verified. Note that this can take a while, so there are a few levels of parallelism this repository supports.
- Firstly, we suggest running 8 processes. This is the `--num-processes` flag. Beyond this, Docker hits issues.
- Secondly, we support a notion of breaking up the dataset into shards. These are the `--shard-ct` and `--shard-id` flags. This makes it relatively easy to split up the work across multiple machines, which circumvents the issues with scaling Docker beyond 8 processes.

Expand Down Expand Up @@ -166,7 +166,7 @@ python majority_vote_ensembler.py example_ensembler_data.jsonl --output_path exa

#### Input Format

The input JSONL file should contain a list of problem objects, each with the following structure:
The input JSONL file should contain a list of problem objects, each with the following structure. The `diffs` are the candidate solutions generated by the agent. The `eval_outcomes` are the results of running the eval harness on each candidate solution, where the index corresponds to the index in the `diffs` array.

```json
{
Expand Down
28 changes: 19 additions & 9 deletions cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from rich.panel import Panel
from prompt_toolkit import prompt
from prompt_toolkit.history import InMemoryHistory
from termcolor import colored

from tools.agent import Agent
from utils.workspace_manager import WorkspaceManager
Expand All @@ -26,6 +25,7 @@
MAX_OUTPUT_TOKENS_PER_TURN = 32768
MAX_TURNS = 200


def main():
"""Main entry point for the CLI."""
# Parse command-line arguments
Expand Down Expand Up @@ -84,7 +84,7 @@ def main():
if not args.minimize_stdout_logs:
logger_for_agent_logs.addHandler(logging.StreamHandler())
else:
logger_for_agent_logs.propagate = False
logger_for_agent_logs.propagate = False

# Check if ANTHROPIC_API_KEY is set
if "ANTHROPIC_API_KEY" not in os.environ:
Expand All @@ -108,7 +108,9 @@ def main():
)
)
else:
logger_for_agent_logs.info("Agent CLI started. Waiting for user input. Press Ctrl+C to exit. Type 'exit' or 'quit' to end the session.")
logger_for_agent_logs.info(
"Agent CLI started. Waiting for user input. Press Ctrl+C to exit. Type 'exit' or 'quit' to end the session."
)

# Initialize LLM client
client = get_client(
Expand All @@ -119,7 +121,9 @@ def main():

# Initialize workspace manager
workspace_path = Path(args.workspace).resolve()
workspace_manager = WorkspaceManager(root=workspace_path, container_workspace=args.use_container_workspace)
workspace_manager = WorkspaceManager(
root=workspace_path, container_workspace=args.use_container_workspace
)

# Initialize agent
agent = Agent(
Expand All @@ -135,8 +139,12 @@ def main():

if args.problem_statement is not None:
instruction = INSTRUCTION_PROMPT.format(
location=workspace_path if args.use_container_workspace is None else args.use_container_workspace,
pr_description=args.problem_statement
location=(
workspace_path
if args.use_container_workspace is None
else args.use_container_workspace
),
pr_description=args.problem_statement,
)
else:
instruction = None
Expand All @@ -157,7 +165,9 @@ def main():
break
else:
user_input = instruction
logger_for_agent_logs.info(f"User instruction:\n{user_input}\n-------------")
logger_for_agent_logs.info(
f"User instruction:\n{user_input}\n-------------"
)

# Run the agent with the user input
logger_for_agent_logs.info("\nAgent is thinking...")
Expand All @@ -167,7 +177,7 @@ def main():
except Exception as e:
logger_for_agent_logs.info(f"Error: {str(e)}")

logger_for_agent_logs.info("\n" + '-' * 40 + "\n")
logger_for_agent_logs.info("\n" + "-" * 40 + "\n")

if instruction is not None:
break
Expand All @@ -179,4 +189,4 @@ def main():


if __name__ == "__main__":
main()
main()
2 changes: 1 addition & 1 deletion example_ensembler_results.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@
"selected_diff": "@@ -45,3 +45,12 @@ def is_palindrome(text):\n cleaned_text = ''.join(c.lower() for c in text if c.isalnum())\n return cleaned_text == cleaned_text[::-1]\n \n+def is_valid_email(email):\n+ \"\"\"\n+ Check if a string is a valid email address.\n+ \"\"\"\n+ import re\n+ \n+ # Simple regex pattern for email validation\n+ pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$'\n+ return bool(re.match(pattern, email))\n",
"is_eval_success": true
}
]
]
Loading