augmentcode · c-flaherty · Mar 31, 2025 · Mar 31, 2025 · Mar 31, 2025 · Mar 31, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,30 @@
+# Lints (pre-commit) and tests (pytest) every pull request that targets main.
+name: CI - pre-commit & run tests
+
+on:
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  lint-and-test:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11.7'  # quoted so YAML keeps it a string
+
+      # NOTE(review): assumes setup.sh installs `uv`; the steps below call `uv run` and no other step provides it.
+      - name: Install dependencies
+        run: |
+          ./setup.sh
+
+      - name: Run pre-commit
+        run: uv run pre-commit run --all-files
+
+      - name: Run pytest
+        run: uv run pytest
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,46 @@
+# pre-commit hooks: generic file hygiene, ruff lint/format, secret scanning, pyright.
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0  # Use the latest stable version
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+      - id: check-symlinks
+      - id: check-added-large-files
+      - id: check-case-conflict
+      - id: check-json
+
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.11.2
+    hooks:
+      # Run the linter.
+      - id: ruff
+        args: [ --fix ]
+      # Run the formatter.
+      - id: ruff-format
+      # manual stages to auto-correct
+      - id: ruff
+        args: [ --fix ]
+        stages: [manual]
+      - id: ruff-format
+        stages: [manual]
+
+  - repo: https://github.com/Yelp/detect-secrets
+    rev: v1.5.0
+    hooks:
+      - id: detect-secrets
+        # Only scan options belong here; `audit` is a separate interactive CLI
+        # subcommand, and pre-commit would pass it to the hook as a stray argument.
+        args: ['--baseline', '.secrets.baseline']
+
+  - repo: local
+    hooks:
+      # language: system, so the `pyright` executable must already be on PATH.
+      - id: pyright
+        name: pyright
+        entry: pyright
+        language: system
+        types: [python]
+        args: [--stats, -p, pyrightconfig.ci.json]
diff --git a/.secrets.baseline b/.secrets.baseline
@@ -0,0 +1,137 @@
+{
+  "version": "1.5.0",
+  "plugins_used": [
+    {
+      "name": "ArtifactoryDetector"
+    },
+    {
+      "name": "AWSKeyDetector"
+    },
+    {
+      "name": "AzureStorageKeyDetector"
+    },
+    {
+      "name": "Base64HighEntropyString",
+      "limit": 4.5
+    },
+    {
+      "name": "BasicAuthDetector"
+    },
+    {
+      "name": "CloudantDetector"
+    },
+    {
+      "name": "DiscordBotTokenDetector"
+    },
+    {
+      "name": "GitHubTokenDetector"
+    },
+    {
+      "name": "GitLabTokenDetector"
+    },
+    {
+      "name": "HexHighEntropyString",
+      "limit": 3.0
+    },
+    {
+      "name": "IbmCloudIamDetector"
+    },
+    {
+      "name": "IbmCosHmacDetector"
+    },
+    {
+      "name": "IPPublicDetector"
+    },
+    {
+      "name": "JwtTokenDetector"
+    },
+    {
+      "name": "KeywordDetector",
+      "keyword_exclude": ""
+    },
+    {
+      "name": "MailchimpDetector"
+    },
+    {
+      "name": "NpmDetector"
+    },
+    {
+      "name": "OpenAIDetector"
+    },
+    {
+      "name": "PrivateKeyDetector"
+    },
+    {
+      "name": "PypiTokenDetector"
+    },
+    {
+      "name": "SendGridDetector"
+    },
+    {
+      "name": "SlackDetector"
+    },
+    {
+      "name": "SoftlayerDetector"
+    },
+    {
+      "name": "SquareOAuthDetector"
+    },
+    {
+      "name": "StripeDetector"
+    },
+    {
+      "name": "TelegramBotTokenDetector"
+    },
+    {
+      "name": "TwilioKeyDetector"
+    }
+  ],
+  "filters_used": [
+    {
+      "path": "detect_secrets.filters.allowlist.is_line_allowlisted"
+    },
+    {
+      "path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies",
+      "min_level": 2
+    },
+    {
+      "path": "detect_secrets.filters.heuristic.is_indirect_reference"
+    },
+    {
+      "path": "detect_secrets.filters.heuristic.is_likely_id_string"
+    },
+    {
+      "path": "detect_secrets.filters.heuristic.is_lock_file"
+    },
+    {
+      "path": "detect_secrets.filters.heuristic.is_not_alphanumeric_string"
+    },
+    {
+      "path": "detect_secrets.filters.heuristic.is_potential_uuid"
+    },
+    {
+      "path": "detect_secrets.filters.heuristic.is_prefixed_with_dollar_sign"
+    },
+    {
+      "path": "detect_secrets.filters.heuristic.is_sequential_string"
+    },
+    {
+      "path": "detect_secrets.filters.heuristic.is_swagger_file"
+    },
+    {
+      "path": "detect_secrets.filters.heuristic.is_templated_secret"
+    }
+  ],
+  "results": {
+    "setup.sh": [
+      {
+        "type": "Hex High Entropy String",
+        "filename": "setup.sh",
+        "hashed_secret": "7431e16af96558909e41438950a6ffe7ee811465",
+        "is_verified": false,
+        "line_number": 31
+      }
+    ]
+  },
+  "generated_at": "2025-03-31T20:57:53Z"
+}
diff --git a/LICENSE b/LICENSE
@@ -30,4 +30,4 @@ licensed under the MIT License. The original code has been modified.
 Original copyright:
 © 2024 Anthropic, PBC
 
-Original license: https://github.com/modelcontextprotocol/servers/blob/main/LICENSE
+Original license: https://github.com/modelcontextprotocol/servers/blob/main/LICENSE
diff --git a/README.md b/README.md
@@ -120,7 +120,7 @@ You can increase `--num-examples` and `--num-candidate-solutions` to run on more
 
 ### Running on more examples.
 
-There are 500 examples total in SWE-bench Verified. Note that this can take awhile, so there are a few levels of parallelism this repository supports. 
+There are 500 examples total in SWE-bench Verified. Note that this can take a while, so there are a few levels of parallelism this repository supports.
 - Firstly, we suggest running 8 processes. This is the `--num-processes` flag. Beyond this, Docker hits issues.
 - Secondly, we support a notion of breaking up the dataset into shards. This is the `--shard-ct` and `--shard-id` flags. This makes it relatively easy to split up the work across multiple machines, which circumnvents the issues with scaling Docker byeond 8 processes.
 
@@ -166,7 +166,7 @@ python majority_vote_ensembler.py example_ensembler_data.jsonl --output_path exa
 
 #### Input Format
 
-The input JSONL file should contain a list of problem objects, each with the following structure:
+The input JSONL file should contain a list of problem objects, each with the following structure. The `diffs` are the candidate solutions generated by the agent. The `eval_outcomes` are the results of running the eval harness on each candidate solution, where the index corresponds to the index in the `diffs` array.
 
 ```json
 {

diff --git a/cli.py b/cli.py
@@ -16,7 +16,6 @@
 from rich.panel import Panel
 from prompt_toolkit import prompt
 from prompt_toolkit.history import InMemoryHistory
-from termcolor import colored
 
 from tools.agent import Agent
 from utils.workspace_manager import WorkspaceManager
@@ -26,6 +25,7 @@
 MAX_OUTPUT_TOKENS_PER_TURN = 32768
 MAX_TURNS = 200
 
+
 def main():
     """Main entry point for the CLI."""
     # Parse command-line arguments
@@ -84,7 +84,7 @@ def main():
     if not args.minimize_stdout_logs:
         logger_for_agent_logs.addHandler(logging.StreamHandler())
     else:
-        logger_for_agent_logs.propagate = False        
+        logger_for_agent_logs.propagate = False
 
     # Check if ANTHROPIC_API_KEY is set
     if "ANTHROPIC_API_KEY" not in os.environ:
@@ -108,7 +108,9 @@ def main():
             )
         )
     else:
-        logger_for_agent_logs.info("Agent CLI started. Waiting for user input. Press Ctrl+C to exit. Type 'exit' or 'quit' to end the session.")
+        logger_for_agent_logs.info(
+            "Agent CLI started. Waiting for user input. Press Ctrl+C to exit. Type 'exit' or 'quit' to end the session."
+        )
 
     # Initialize LLM client
     client = get_client(
@@ -119,7 +121,9 @@ def main():
 
     # Initialize workspace manager
     workspace_path = Path(args.workspace).resolve()
-    workspace_manager = WorkspaceManager(root=workspace_path, container_workspace=args.use_container_workspace)
+    workspace_manager = WorkspaceManager(
+        root=workspace_path, container_workspace=args.use_container_workspace
+    )
 
     # Initialize agent
     agent = Agent(
@@ -135,8 +139,12 @@ def main():
 
     if args.problem_statement is not None:
         instruction = INSTRUCTION_PROMPT.format(
-            location=workspace_path if args.use_container_workspace is None else args.use_container_workspace,
-            pr_description=args.problem_statement
+            location=(
+                workspace_path
+                if args.use_container_workspace is None
+                else args.use_container_workspace
+            ),
+            pr_description=args.problem_statement,
         )
     else:
         instruction = None
@@ -157,7 +165,9 @@ def main():
                     break
             else:
                 user_input = instruction
-                logger_for_agent_logs.info(f"User instruction:\n{user_input}\n-------------")
+                logger_for_agent_logs.info(
+                    f"User instruction:\n{user_input}\n-------------"
+                )
 
             # Run the agent with the user input
             logger_for_agent_logs.info("\nAgent is thinking...")
@@ -167,7 +177,7 @@ def main():
             except Exception as e:
                 logger_for_agent_logs.info(f"Error: {str(e)}")
 
-            logger_for_agent_logs.info("\n" + '-' * 40 + "\n")
+            logger_for_agent_logs.info("\n" + "-" * 40 + "\n")
 
             if instruction is not None:
                 break
@@ -179,4 +189,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
diff --git a/example_ensembler_results.json b/example_ensembler_results.json
@@ -23,4 +23,4 @@
     "selected_diff": "@@ -45,3 +45,12 @@ def is_palindrome(text):\n     cleaned_text = ''.join(c.lower() for c in text if c.isalnum())\n     return cleaned_text == cleaned_text[::-1]\n \n+def is_valid_email(email):\n+    \"\"\"\n+    Check if a string is a valid email address.\n+    \"\"\"\n+    import re\n+    \n+    # Simple regex pattern for email validation\n+    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$'\n+    return bool(re.match(pattern, email))\n",
     "is_eval_success": true
   }
-]
+]
-Original file line number
+Diff line change
@@ Expand Up / @@ -23,4 +23,4 @@ @@
         "selected_diff": "@@ -45,3 +45,12 @@ def is_palindrome(text):\n     cleaned_text = ''.join(c.lower() for c in text if c.isalnum())\n     return cleaned_text == cleaned_text[::-1]\n \n+def is_valid_email(email):\n+    \"\"\"\n+    Check if a string is a valid email address.\n+    \"\"\"\n+    import re\n+    \n+    # Simple regex pattern for email validation\n+    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$'\n+    return bool(re.match(pattern, email))\n",
         "is_eval_success": true
       }
-    ]
+    ]