From f38294e729109744c956b6c42ca1a63d313aa4ce Mon Sep 17 00:00:00 2001
From: Rich Hankins
Date: Mon, 15 Dec 2025 22:11:05 +0000
Subject: [PATCH] Add GitHub workflow

---
 .github/workflows/index.yml |  80 ++++++++
 src/__init__.py             |  24 +++
 src/file_filter.py          | 123 +++++++++++
 src/github_client.py        | 307 ++++++++++++++++++++++++++++
 src/index_manager.py        | 395 ++++++++++++++++++++++++++++++++++++
 src/main.py                 | 167 +++++++++++++++
 src/models.py               | 131 ++++++++++++
 src/search.py               | 132 ++++++++++++
 8 files changed, 1359 insertions(+)
 create mode 100644 .github/workflows/index.yml
 create mode 100644 src/__init__.py
 create mode 100644 src/file_filter.py
 create mode 100644 src/github_client.py
 create mode 100644 src/index_manager.py
 create mode 100644 src/main.py
 create mode 100644 src/models.py
 create mode 100644 src/search.py

diff --git a/.github/workflows/index.yml b/.github/workflows/index.yml
new file mode 100644
index 0000000..43c349a
--- /dev/null
+++ b/.github/workflows/index.yml
@@ -0,0 +1,80 @@
+name: Index Repository
+
+on:
+  push:
+    branches:
+      - main
+      - develop
+      - 'feature/**'  # Index feature branches
+      - 'release/**'  # Index release branches
+  workflow_dispatch:
+    inputs:
+      branch:
+        description: 'Branch to index (leave empty for current branch)'
+        required: false
+        type: string
+      force_full_reindex:
+        description: 'Force full re-index'
+        required: false
+        type: boolean
+        default: false
+
+jobs:
+  index:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # Full history for comparison
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+          cache: 'pip'
+
+      - name: Install dependencies
+        run: pip install -r requirements.txt
+
+      - name: Restore index state
+        uses: actions/cache@v4
+        with:
+          path: .augment-index-state
+          # Use branch-specific cache key
+          key: augment-index-${{ github.ref_name }}-${{ github.sha }}
+          restore-keys: |
+            augment-index-${{ github.ref_name }}-
+
+      - name: Index repository
+        id: index
+        # Run as a module so main.py's relative imports resolve
+        run: python -m src.main
+        env:
+          AUGMENT_API_TOKEN: ${{ secrets.AUGMENT_API_TOKEN }}
+          AUGMENT_API_URL: ${{ secrets.AUGMENT_API_URL }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          STORAGE_TYPE: file
+          # Branch-specific state path (automatically determined from GITHUB_REF)
+          # STATE_PATH is optional - defaults to .augment-index-state/{branch}/state.json
+          MAX_COMMITS: 100
+          MAX_FILES: 500
+
+      - name: Print results
+        if: always()
+        run: |
+          echo "Success: ${{ steps.index.outputs.success }}"
+          echo "Type: ${{ steps.index.outputs.type }}"
+          echo "Files Indexed: ${{ steps.index.outputs.files_indexed }}"
+          echo "Files Deleted: ${{ steps.index.outputs.files_deleted }}"
+          echo "Checkpoint ID: ${{ steps.index.outputs.checkpoint_id }}"
+          echo "Commit SHA: ${{ steps.index.outputs.commit_sha }}"
+
+      - name: Upload state artifact
+        if: success()
+        uses: actions/upload-artifact@v4
+        with:
+          name: index-state
+          path: .augment-index-state/
+          retention-days: 30

diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..499dfe6
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1,24 @@
+"""
+GitHub Action Repository Indexer
+
+A Python example showing how to index a GitHub repository using the Augment SDK
+Direct Mode with incremental updates.
+
+See README.md for usage instructions.
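+
+Quick start (a minimal sketch mirroring src/main.py; token values, repository,
+and state path are illustrative):
+
+    from auggie_sdk.context import DirectContext
+
+    from src import IndexManager
+    from src.models import IndexConfig
+
+    config = IndexConfig(
+        apiToken="...", apiUrl="https://your-tenant.api.augmentcode.com/",
+        githubToken="...", owner="octocat", repo="hello-world",
+        branch="main", currentCommit="HEAD",
+    )
+    context = DirectContext.create(api_key=config.apiToken, api_url=config.apiUrl)
+    manager = IndexManager(context, config, ".augment-index-state/main/state.json")
+    manager.resolve_commit_sha()  # "HEAD" -> full 40-character SHA
+    result = manager.index()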
+""" + +from .models import FileChange, IndexConfig, IndexResult, IndexState +from .file_filter import should_filter_file +from .github_client import GitHubClient +from .index_manager import IndexManager + +__all__ = [ + "FileChange", + "IndexConfig", + "IndexResult", + "IndexState", + "should_filter_file", + "GitHubClient", + "IndexManager", +] + diff --git a/src/file_filter.py b/src/file_filter.py new file mode 100644 index 0000000..88ab035 --- /dev/null +++ b/src/file_filter.py @@ -0,0 +1,123 @@ +""" +File filtering logic for GitHub repository indexing. +""" + +import re +from pathlib import Path +from typing import Optional + +# Keyish pattern regex - matches files that likely contain secrets/keys +KEYISH_PATTERN = re.compile( + r'^(\.git|.*\.pem|.*\.key|.*\.pfx|.*\.p12|.*\.jks|.*\.keystore|.*\.pkcs12|.*\.crt|.*\.cer|id_rsa|id_ed25519|id_ecdsa|id_dsa)$' +) + +# Default max file size in bytes (1 MB) +DEFAULT_MAX_FILE_SIZE = 1024 * 1024 # 1 MB + + +def always_ignore_path(path: str) -> bool: + """ + Check if a path should always be ignored (security measure). + + Args: + path: The file path to check. + + Returns: + True if the path contains ".." and should be ignored. + """ + return ".." in path + + +def is_keyish_path(path: str) -> bool: + """ + Check if a path matches the keyish pattern (secrets/keys). + + Args: + path: The file path to check. + + Returns: + True if the filename matches patterns for secret/key files. + """ + # Extract filename from path + filename = Path(path).name + return bool(KEYISH_PATTERN.match(filename)) + + +def is_valid_file_size(size_bytes: int, max_file_size: int = DEFAULT_MAX_FILE_SIZE) -> bool: + """ + Check if file size is valid for upload. + + Args: + size_bytes: The size of the file in bytes. + max_file_size: Maximum allowed file size in bytes. Defaults to 1 MB. + + Returns: + True if the file size is within the allowed limit. + """ + return size_bytes <= max_file_size + + +def is_valid_utf8(content: bytes) -> bool: + """ + Check if file content is valid UTF-8 (not binary). + + Args: + content: The file content as bytes. + + Returns: + True if the content is valid UTF-8, False if it's binary or invalid. + """ + try: + content.decode("utf-8") + return True + except UnicodeDecodeError: + return False + + +def should_filter_file( + path: str, + content: bytes, + max_file_size: Optional[int] = None, +) -> dict: + """ + Check if a file should be filtered out. + + Returns {"filtered": True, "reason": "..."} if file should be skipped. + Returns {"filtered": False} if file should be included. + + Priority order (from file-filtering.md): + 1. Path validation (contains "..") + 2. File size check + 3. .augmentignore rules (checked by caller) + 4. Keyish patterns + 5. .gitignore rules (checked by caller) + 6. UTF-8 validation + + Args: + path: The file path to check. + content: The file content as bytes. + max_file_size: Maximum allowed file size in bytes. Defaults to DEFAULT_MAX_FILE_SIZE. + + Returns: + A dict with "filtered" (bool) and optionally "reason" (str) keys. + """ + effective_max_size = max_file_size if max_file_size is not None else DEFAULT_MAX_FILE_SIZE + + # 1. Check for ".." in path (security) + if always_ignore_path(path): + return {"filtered": True, "reason": "path_contains_dotdot"} + + # 2. Check file size + if not is_valid_file_size(len(content), effective_max_size): + return {"filtered": True, "reason": f"file_too_large ({len(content)} bytes)"} + + # 3. 
Check keyish patterns (secrets/keys) + if is_keyish_path(path): + return {"filtered": True, "reason": "keyish_pattern"} + + # 4. Check UTF-8 validity (binary detection) + if not is_valid_utf8(content): + return {"filtered": True, "reason": "binary_file"} + + return {"filtered": False} + diff --git a/src/github_client.py b/src/github_client.py new file mode 100644 index 0000000..f69bd62 --- /dev/null +++ b/src/github_client.py @@ -0,0 +1,307 @@ +""" +GitHub API client for fetching repository data. +""" + +import io +import tarfile + +import pathspec +import requests +from github import Github +from github.GithubException import GithubException + +from .file_filter import should_filter_file +from .models import FileChange + + +class GitHubClient: + """GitHub API client for fetching repository data.""" + + def __init__(self, token: str) -> None: + """ + Initialize the GitHub client with an authentication token. + + Args: + token: GitHub personal access token or GitHub App token. + """ + self._github = Github(token) + self._token = token + + def resolve_ref(self, owner: str, repo: str, ref: str) -> str: + """ + Resolve a ref (like "HEAD", "main", or a commit SHA) to a commit SHA. + + Args: + owner: Repository owner. + repo: Repository name. + ref: Git ref to resolve. + + Returns: + The full 40-character commit SHA. + + Raises: + Exception: If the ref cannot be resolved. + """ + try: + repository = self._github.get_repo(f"{owner}/{repo}") + commit = repository.get_commit(ref) + return commit.sha + except GithubException as error: + raise Exception( + f'Failed to resolve ref "{ref}" for {owner}/{repo}: {error}' + ) from error + + def download_tarball(self, owner: str, repo: str, ref: str) -> dict[str, str]: + """ + Download repository as tarball and extract files. + + Args: + owner: Repository owner. + repo: Repository name. + ref: Git ref to download. + + Returns: + Dictionary mapping file paths to their contents. + """ + print(f"Downloading tarball for {owner}/{repo}@{ref}...") + + repository = self._github.get_repo(f"{owner}/{repo}") + tarball_url = repository.get_archive_link("tarball", ref) + + # Download tarball (10 minute timeout to handle large repositories) + # Include auth header for private repos + headers = {"Authorization": f"Bearer {self._token}"} + response = requests.get(tarball_url, headers=headers, stream=True, timeout=600) + if not response.ok: + raise Exception(f"Failed to download tarball: {response.reason}") + + # Load ignore patterns + augmentignore, gitignore = self._load_ignore_patterns(owner, repo, ref) + + # Track filtering statistics + files: dict[str, str] = {} + total_files = 0 + filtered_files = 0 + filter_reasons: dict[str, int] = {} + + # Extract files from tarball + tarball_data = io.BytesIO(response.content) + with tarfile.open(fileobj=tarball_data, mode="r:gz") as tar: + for member in tar.getmembers(): + # Skip directories and symlinks + if not member.isfile(): + continue + + total_files += 1 + + # Remove the root directory prefix (e.g., "owner-repo-sha/") + path_parts = member.name.split("/") + path_parts.pop(0) # Remove first component + file_path = "/".join(path_parts) + + if not file_path: + continue + + # Read file contents + file_obj = tar.extractfile(member) + if file_obj is None: + continue + content_bytes = file_obj.read() + + # Apply filtering in priority order: + # 1. 
.augmentignore + if augmentignore and augmentignore.match_file(file_path): + filtered_files += 1 + filter_reasons["augmentignore"] = filter_reasons.get("augmentignore", 0) + 1 + continue + + # 2. Path validation, file size, keyish patterns, UTF-8 validation + filter_result = should_filter_file(path=file_path, content=content_bytes) + + if filter_result["filtered"]: + filtered_files += 1 + reason = filter_result.get("reason", "unknown") + filter_reasons[reason] = filter_reasons.get(reason, 0) + 1 + continue + + # 3. .gitignore (checked last) + if gitignore and gitignore.match_file(file_path): + filtered_files += 1 + filter_reasons["gitignore"] = filter_reasons.get("gitignore", 0) + 1 + continue + + # File passed all filters + try: + contents = content_bytes.decode("utf-8") + files[file_path] = contents + except UnicodeDecodeError: + # This should not happen if is_valid_utf8() is working correctly + filtered_files += 1 + filter_reasons["decode_error"] = filter_reasons.get("decode_error", 0) + 1 + print(f"Warning: File {file_path} passed UTF-8 validation but failed to decode") + + print(f"Extracted {len(files)} files from tarball") + print(f"Filtered {filtered_files} of {total_files} files. Reasons: {filter_reasons}") + return files + + def compare_commits( + self, owner: str, repo: str, base: str, head: str + ) -> dict: + """ + Compare two commits and get file changes. + """ + print(f"Comparing {base}...{head}...") + + repository = self._github.get_repo(f"{owner}/{repo}") + comparison = repository.compare(base, head) + + files: list[FileChange] = [] + + for file in comparison.files: + change = FileChange( + path=file.filename, + status=self._map_github_status(file.status), + previousFilename=file.previous_filename, + ) + + # Download file contents for added/modified files + if change.status in ("added", "modified"): + try: + contents = self.get_file_contents(owner, repo, file.filename, head) + change.contents = contents + except Exception as error: + print(f"Warning: Failed to download {file.filename}: {error}") + + files.append(change) + + return { + "files": files, + "commits": comparison.total_commits, + "totalChanges": len(comparison.files), + } + + def get_file_contents( + self, owner: str, repo: str, path: str, ref: str + ) -> str: + """ + Get file contents at a specific ref. + + Args: + owner: Repository owner. + repo: Repository name. + path: File path within the repository. + ref: Git ref to get contents at. + + Returns: + The file contents as a string. + + Raises: + Exception: If the path is not a file. + """ + repository = self._github.get_repo(f"{owner}/{repo}") + content = repository.get_contents(path, ref) + + if isinstance(content, list): + raise Exception(f"{path} is not a file") + + return content.decoded_content.decode("utf-8") + + def _load_ignore_patterns( + self, owner: str, repo: str, ref: str + ) -> tuple[pathspec.PathSpec | None, pathspec.PathSpec | None]: + """ + Load .gitignore and .augmentignore patterns separately. + + Returns both filters to maintain proper priority order: + .augmentignore → keyish → .gitignore + + Args: + owner: Repository owner. + repo: Repository name. + ref: Git ref to load patterns from. + + Returns: + Tuple of (augmentignore, gitignore) PathSpec objects, or None if not found. 
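+
+        Example (illustrative path) of how the returned specs are applied in
+        download_tarball(), preserving the priority order above:
+
+            augmentignore, gitignore = self._load_ignore_patterns(owner, repo, ref)
+            if augmentignore and augmentignore.match_file("dist/bundle.js"):
+                ...  # dropped first, before the keyish/size/UTF-8 checks
+            elif gitignore and gitignore.match_file("dist/bundle.js"):
+                ...  # dropped last, after all other filters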
+ """ + augmentignore: pathspec.PathSpec | None = None + gitignore: pathspec.PathSpec | None = None + + # Try to load .gitignore + try: + gitignore_content = self.get_file_contents(owner, repo, ".gitignore", ref) + gitignore = pathspec.PathSpec.from_lines("gitwildmatch", gitignore_content.splitlines()) + except Exception: + # .gitignore doesn't exist + pass + + # Try to load .augmentignore + try: + augmentignore_content = self.get_file_contents(owner, repo, ".augmentignore", ref) + augmentignore = pathspec.PathSpec.from_lines("gitwildmatch", augmentignore_content.splitlines()) + except Exception: + # .augmentignore doesn't exist + pass + + return augmentignore, gitignore + + def _map_github_status(self, status: str) -> str: + """ + Map GitHub file status to our FileChange status. + + Args: + status: GitHub file status string. + + Returns: + Normalized status string. + """ + status_map = { + "added": "added", + "modified": "modified", + "removed": "removed", + "renamed": "renamed", + } + return status_map.get(status, "modified") + + def ignore_files_changed( + self, owner: str, repo: str, base: str, head: str + ) -> bool: + """ + Check if ignore files changed between commits. + + Args: + owner: Repository owner. + repo: Repository name. + base: Base commit SHA. + head: Head commit SHA. + + Returns: + True if .gitignore or .augmentignore changed, False otherwise. + """ + repository = self._github.get_repo(f"{owner}/{repo}") + comparison = repository.compare(base, head) + + ignore_files = [".gitignore", ".augmentignore"] + return any(file.filename in ignore_files for file in comparison.files) + + def is_force_push( + self, owner: str, repo: str, base: str, head: str + ) -> bool: + """ + Check if the push was a force push. + + Args: + owner: Repository owner. + repo: Repository name. + base: Base commit SHA. + head: Head commit SHA. + + Returns: + True if the push was a force push, False otherwise. + """ + try: + repository = self._github.get_repo(f"{owner}/{repo}") + repository.compare(base, head) + return False + except GithubException: + # If comparison fails, it's likely a force push + return True diff --git a/src/index_manager.py b/src/index_manager.py new file mode 100644 index 0000000..c2bf48f --- /dev/null +++ b/src/index_manager.py @@ -0,0 +1,395 @@ +""" +Index Manager - Core indexing logic +""" + +import json +import tempfile +from pathlib import Path +from typing import Optional + +from auggie_sdk.context import DirectContext, File + +from .github_client import GitHubClient +from .models import FileChange, IndexConfig, IndexResult, IndexState, RepositoryInfo + +DEFAULT_MAX_COMMITS = 100 +DEFAULT_MAX_FILES = 500 + + +class IndexManager: + """Index Manager - Core indexing logic for GitHub repositories.""" + + def __init__( + self, context: DirectContext, config: IndexConfig, state_path: str + ) -> None: + """ + Initialize the IndexManager. + + Args: + context: DirectContext instance for indexing operations. + config: Configuration for the indexing operation. + state_path: Path to the state file for persistence. + """ + self._context = context + self._config = config + self._state_path = state_path + self._github = GitHubClient(config.githubToken) + + def resolve_commit_sha(self) -> None: + """ + Resolve the current commit ref to an actual commit SHA. + + This handles cases where GITHUB_SHA might be "HEAD" or a branch name. + Updates the config.currentCommit with the resolved SHA. 
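+
+        Example (illustrative; `manager` is an IndexManager instance):
+
+            config.currentCommit = "HEAD"
+            manager.resolve_commit_sha()
+            # config.currentCommit is now a full 40-character SHA, e.g.
+            # "f38294e729109744c956b6c42ca1a63d313aa4ce"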
+ """ + resolved_sha = self._github.resolve_ref( + self._config.owner, self._config.repo, self._config.currentCommit + ) + self._config.currentCommit = resolved_sha + + def _load_state(self) -> Optional[IndexState]: + """ + Load index state from file system. + + EXTENDING TO OTHER STORAGE BACKENDS: + Replace this method to load state from your preferred storage: + - Redis: Use redis-py client to GET the state JSON + - S3: Use boto3 to get_object from S3 bucket + - Database: Query your database for the state record + + Example for Redis: + import redis + r = redis.Redis.from_url(redis_url) + data = r.get(state_key) + return json.loads(data) if data else None + + Example for S3: + import boto3 + s3 = boto3.client('s3') + response = s3.get_object(Bucket=bucket, Key=key) + data = response['Body'].read().decode('utf-8') + return json.loads(data) + + Returns: + The loaded IndexState or None if the file doesn't exist. + """ + try: + with open(self._state_path, "r", encoding="utf-8") as f: + return json.load(f) + except FileNotFoundError: + return None + + def _save_state(self, state: IndexState) -> None: + """ + Save index state to file system. + + EXTENDING TO OTHER STORAGE BACKENDS: + Replace this method to save state to your preferred storage: + - Redis: Use redis-py client to SET the state JSON + - S3: Use boto3 to put_object to S3 bucket + - Database: Insert or update the state record in your database + + Example for Redis: + import redis + r = redis.Redis.from_url(redis_url) + r.set(state_key, json.dumps(state)) + + Example for S3: + import boto3 + s3 = boto3.client('s3') + s3.put_object( + Bucket=bucket, + Key=key, + Body=json.dumps(state), + ContentType='application/json' + ) + + Note: The state is just a JSON object (IndexState type) that can be + serialized and stored anywhere. For distributed systems, consider using + Redis or a database for shared state across multiple workers. + + Args: + state: The IndexState to save. + """ + # Ensure directory exists + Path(self._state_path).parent.mkdir(parents=True, exist_ok=True) + + # Write state to file + with open(self._state_path, "w", encoding="utf-8") as f: + json.dump(state, f, indent=2) + + def index(self) -> IndexResult: + """ + Main indexing entry point. + + Returns: + IndexResult with success status and indexing details. + """ + print( + f"Starting index for {self._config.owner}/{self._config.repo}" + f"@{self._config.branch}" + ) + + try: + # Load previous state + previous_state = self._load_state() + + # If we have previous state, we'll need to create a new context with the imported state + # For now, we'll handle this in the incremental update logic + + # Determine if we need full re-index + should_reindex, reason = self._should_full_reindex(previous_state) + + if should_reindex: + return self._full_reindex(reason) + + # Perform incremental update + # previous_state is guaranteed to be non-null here + if not previous_state: + raise RuntimeError("previous_state should not be None at this point") + return self._incremental_update(previous_state) + except Exception as error: + print(f"Indexing failed: {error}") + return IndexResult( + success=False, + type="full", + filesIndexed=0, + filesDeleted=0, + checkpointId="", + commitSha=self._config.currentCommit, + error=str(error), + ) + + def _should_full_reindex( + self, previous_state: Optional[IndexState] + ) -> tuple[bool, Optional[str]]: + """ + Determine if full re-index is needed. + + Args: + previous_state: The previous index state, or None if first run. 
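+
+        Checks run in this order: first run, different repository, unchanged
+        commit (returns early with no re-index), force push, too many commits
+        (maxCommits), too many file changes (maxFiles), and changed
+        .gitignore/.augmentignore files.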
+ + Returns: + Tuple of (should_reindex, reason). + """ + # No previous state - first run + if not previous_state: + return (True, "first_run") + + # Different repository + if ( + previous_state["repository"]["owner"] != self._config.owner + or previous_state["repository"]["name"] != self._config.repo + ): + return (True, "different_repository") + + # Same commit - no changes + if previous_state["lastCommitSha"] == self._config.currentCommit: + print("No changes detected") + return (False, None) + + # Check for force push + is_force_push = self._github.is_force_push( + self._config.owner, + self._config.repo, + previous_state["lastCommitSha"], + self._config.currentCommit, + ) + + if is_force_push: + return (True, "force_push") + + # Get comparison + comparison = self._github.compare_commits( + self._config.owner, + self._config.repo, + previous_state["lastCommitSha"], + self._config.currentCommit, + ) + + # Too many commits + max_commits = self._config.maxCommits or DEFAULT_MAX_COMMITS + if comparison["commits"] > max_commits: + return ( + True, + f"too_many_commits ({comparison['commits']} > {max_commits})", + ) + + # Too many file changes + max_files = self._config.maxFiles or DEFAULT_MAX_FILES + if comparison["totalChanges"] > max_files: + return ( + True, + f"too_many_files ({comparison['totalChanges']} > {max_files})", + ) + + # Check if ignore files changed + ignore_changed = self._github.ignore_files_changed( + self._config.owner, + self._config.repo, + previous_state["lastCommitSha"], + self._config.currentCommit, + ) + + if ignore_changed: + return (True, "ignore_files_changed") + + return (False, None) + + def _full_reindex(self, reason: Optional[str]) -> IndexResult: + """ + Perform full repository re-index. + + Args: + reason: The reason for the full re-index. + + Returns: + IndexResult with the result of the full re-index. + """ + print(f"Performing full re-index (reason: {reason or 'unknown'})") + + # Download entire repository as tarball + files = self._github.download_tarball( + self._config.owner, self._config.repo, self._config.currentCommit + ) + + # Add all files to index + files_to_index = [ + File(path=path, contents=contents) for path, contents in files.items() + ] + + print(f"Adding {len(files_to_index)} files to index...") + self._context.add_to_index(files_to_index) + + # Export DirectContext state + context_state = self._context.export() + context_state_dict = context_state.to_dict() + + new_state: IndexState = { + "contextState": context_state_dict, + "lastCommitSha": self._config.currentCommit, + "repository": RepositoryInfo( + owner=self._config.owner, + name=self._config.repo, + ), + } + + # Save state + self._save_state(new_state) + + return IndexResult( + success=True, + type="full", + filesIndexed=len(files_to_index), + filesDeleted=0, + checkpointId=context_state.checkpoint_id or "", + commitSha=self._config.currentCommit, + reindexReason=reason, + ) + + def _incremental_update(self, previous_state: IndexState) -> IndexResult: + """ + Perform incremental update. + + Args: + previous_state: The previous index state. + + Returns: + IndexResult with the result of the incremental update. 
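+
+        Note:
+            The previous run's exported context state is round-tripped through
+            a temporary JSON file because DirectContext.import_from_file()
+            reads from disk; only the files changed between the two commits
+            are then uploaded to or removed from the index.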
+ """ + print("Performing incremental update...") + + # Create a temporary file with the previous context state + # Use delete=False because Windows can't reopen a NamedTemporaryFile while it's open + temp_file = tempfile.NamedTemporaryFile( + mode="w", suffix=".json", prefix="github-indexer-incremental-", delete=False + ) + temp_path = Path(temp_file.name) + try: + json.dump(previous_state["contextState"], temp_file, indent=2) + temp_file.close() # Close before reading on Windows + + # Create a new context from the previous state + self._context = DirectContext.import_from_file( + str(temp_path), + api_key=self._config.apiToken, + api_url=self._config.apiUrl, + ) + finally: + temp_path.unlink(missing_ok=True) + + # Get file changes + comparison = self._github.compare_commits( + self._config.owner, + self._config.repo, + previous_state["lastCommitSha"], + self._config.currentCommit, + ) + + # Process changes + files_to_add, files_to_delete = self._process_file_changes(comparison["files"]) + + print(f"Adding {len(files_to_add)} files, deleting {len(files_to_delete)} files") + + # Update index + if files_to_add: + self._context.add_to_index(files_to_add) + + if files_to_delete: + self._context.remove_from_index(files_to_delete) + + # Export DirectContext state + context_state = self._context.export() + context_state_dict = context_state.to_dict() + + new_state: IndexState = { + "contextState": context_state_dict, + "lastCommitSha": self._config.currentCommit, + "repository": previous_state["repository"], + } + + # Save state + self._save_state(new_state) + + return IndexResult( + success=True, + type="incremental", + filesIndexed=len(files_to_add), + filesDeleted=len(files_to_delete), + checkpointId=context_state.checkpoint_id or "", + commitSha=self._config.currentCommit, + ) + + def _process_file_changes( + self, changes: list[FileChange] + ) -> tuple[list[File], list[str]]: + """ + Process file changes and categorize them for indexing. + + Args: + changes: List of file changes from the comparison. + + Returns: + Tuple of (files_to_add, files_to_delete). + """ + files_to_add: list[File] = [] + files_to_delete: list[str] = [] + + for change in changes: + if change.status in ("added", "modified"): + if change.contents: + files_to_add.append( + File(path=change.path, contents=change.contents) + ) + elif change.status == "removed": + files_to_delete.append(change.path) + elif change.status == "renamed": + if change.previousFilename: + files_to_delete.append(change.previousFilename) + if change.contents: + files_to_add.append( + File(path=change.path, contents=change.contents) + ) + + return files_to_add, files_to_delete + diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..fd10065 --- /dev/null +++ b/src/main.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 +""" +Main entry point for GitHub Action Indexer + +Usage: + cd examples/python-sdk/context + python -m github_action_indexer index +""" + +import os +import re +import sys + +from auggie_sdk.context import DirectContext + +from .index_manager import IndexManager +from .models import IndexConfig + + +def get_api_credentials() -> tuple[str, str]: + """Get API credentials from environment variables.""" + api_token = os.environ.get("AUGMENT_API_TOKEN") + if not api_token: + raise ValueError("AUGMENT_API_TOKEN environment variable is required") + + api_url = os.environ.get("AUGMENT_API_URL") + if not api_url: + raise ValueError( + "AUGMENT_API_URL environment variable is required. 
Please set it to your " + "tenant-specific URL (e.g., 'https://your-tenant.api.augmentcode.com/')" + ) + + return api_token, api_url + + +def parse_repository_info() -> tuple[str, str, str, str]: + """ + Parse repository information from environment variables. + Returns (owner, repo, branch, current_commit). + """ + repository = os.environ.get("GITHUB_REPOSITORY", "") + parts = repository.split("/") + + if len(parts) != 2 or not parts[0] or not parts[1]: + raise ValueError('GITHUB_REPOSITORY must be in format "owner/repo"') + + owner, repo = parts + + # Extract branch name from GitHub ref + github_ref = os.environ.get("GITHUB_REF", "") + github_ref_name = os.environ.get("GITHUB_REF_NAME", "") + + if github_ref.startswith("refs/heads/"): + branch = github_ref_name + elif github_ref.startswith("refs/tags/"): + branch = f"tag/{github_ref_name}" + elif github_ref_name: + branch = github_ref_name + else: + branch = os.environ.get("BRANCH", "main") + + current_commit = os.environ.get("GITHUB_SHA", "") + if not current_commit: + raise ValueError("GITHUB_SHA environment variable is required") + + return owner, repo, branch, current_commit + + +def load_config() -> IndexConfig: + """Load configuration from environment variables.""" + github_token = os.environ.get("GITHUB_TOKEN") + if not github_token: + raise ValueError("GITHUB_TOKEN environment variable is required") + + api_token, api_url = get_api_credentials() + owner, repo, branch, current_commit = parse_repository_info() + + max_commits = os.environ.get("MAX_COMMITS") + max_files = os.environ.get("MAX_FILES") + + return IndexConfig( + apiToken=api_token, + apiUrl=api_url, + githubToken=github_token, + owner=owner, + repo=repo, + branch=branch, + currentCommit=current_commit, + maxCommits=int(max_commits) if max_commits else None, + maxFiles=int(max_files) if max_files else None, + ) + + +def get_state_path(branch: str) -> str: + """Get the state file path for the current branch.""" + sanitized_branch = re.sub(r"[^a-zA-Z0-9\-_]", "-", branch) + return os.environ.get( + "STATE_PATH", f".augment-index-state/{sanitized_branch}/state.json" + ) + + +def main() -> None: + """Main function.""" + print("GitHub Action Indexer - Starting...") + + try: + # Load configuration + config = load_config() + state_path = get_state_path(config.branch) + + print(f"Repository: {config.owner}/{config.repo}") + print(f"Branch: {config.branch}") + print(f"Commit ref: {config.currentCommit}") + print(f"State path: {state_path}") + + # Create DirectContext + context = DirectContext.create(api_key=config.apiToken, api_url=config.apiUrl) + + # Create index manager and resolve commit SHA + manager = IndexManager(context, config, state_path) + manager.resolve_commit_sha() + + print(f"Resolved commit SHA: {config.currentCommit}") + + # Perform indexing + result = manager.index() + + # Print results + print("\n=== Indexing Results ===") + print(f"Success: {result.success}") + print(f"Type: {result.type}") + print(f"Files Indexed: {result.filesIndexed}") + print(f"Files Deleted: {result.filesDeleted}") + print(f"Checkpoint ID: {result.checkpointId}") + print(f"Commit SHA: {result.commitSha}") + + if result.reindexReason: + print(f"Re-index Reason: {result.reindexReason}") + + if result.error: + print(f"Error: {result.error}", file=sys.stderr) + sys.exit(1) + + # Set GitHub Actions output + github_output = os.environ.get("GITHUB_OUTPUT") + if github_output: + output_lines = [ + f"success={result.success}", + f"type={result.type}", + f"files_indexed={result.filesIndexed}", + 
f"files_deleted={result.filesDeleted}", + f"checkpoint_id={result.checkpointId}", + f"commit_sha={result.commitSha}", + ] + with open(github_output, "a") as f: + f.write("\n".join(output_lines) + "\n") + + print("\nIndexing completed successfully!") + + except Exception as error: + print(f"Fatal error: {error}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() + diff --git a/src/models.py b/src/models.py new file mode 100644 index 0000000..8b3dfc0 --- /dev/null +++ b/src/models.py @@ -0,0 +1,131 @@ +""" +Types for the GitHub Action Indexer + +This module defines the data types used by the GitHub Action Indexer +for tracking index state, file changes, configuration, and results. +""" + +from dataclasses import dataclass +from typing import Literal, Optional + +from typing_extensions import TypedDict + +from auggie_sdk.context.models import DirectContextState + + +class RepositoryInfo(TypedDict): + """Repository information for index state.""" + + owner: str # Repository owner + name: str # Repository name + + +class IndexState(TypedDict): + """ + Persistent state for the GitHub Action Indexer. + + This state is stored between indexing runs to enable incremental indexing. + """ + + contextState: DirectContextState + """DirectContext state (checkpoint, blobs, etc.)""" + + lastCommitSha: str + """Last indexed commit SHA (must be a full 40-character SHA, not a ref like 'HEAD')""" + + repository: RepositoryInfo + """Repository information - used to verify we're indexing the same repository""" + + +@dataclass +class FileChange: + """ + Represents a file change detected between commits. + + Used to track what files need to be indexed or removed from the index. + """ + + path: str + """File path""" + + status: Literal["added", "modified", "removed", "renamed"] + """Change status: added, modified, removed, renamed""" + + previousFilename: Optional[str] = None + """Previous filename (for renames)""" + + contents: Optional[str] = None + """File contents (for added/modified files)""" + + oldBlobName: Optional[str] = None + """Blob name from previous index (for modified/removed files)""" + + +@dataclass +class IndexConfig: + """ + Configuration for the GitHub Action Indexer. + + Contains all the settings needed to perform indexing of a GitHub repository. + """ + + apiToken: str + """Augment API token""" + + apiUrl: str + """Augment API URL (provided via AUGMENT_API_URL env var)""" + + githubToken: str + """GitHub token""" + + owner: str + """Repository owner""" + + repo: str + """Repository name""" + + branch: str + """Branch to index""" + + currentCommit: str + """Current commit SHA""" + + maxCommits: Optional[int] = None + """Maximum commits before full re-index""" + + maxFiles: Optional[int] = None + """Maximum file changes before full re-index""" + + +@dataclass +class IndexResult: + """ + Result from an indexing operation. + + Contains information about what was indexed and whether it was successful. 
+ """ + + success: bool + """Whether indexing was successful""" + + type: Literal["full", "incremental", "no-changes"] + """Type of indexing performed""" + + filesIndexed: int + """Number of files indexed""" + + filesDeleted: int + """Number of files deleted""" + + checkpointId: str + """New checkpoint ID""" + + commitSha: str + """Commit SHA that was indexed""" + + error: Optional[str] = None + """Error message if failed""" + + reindexReason: Optional[str] = None + """Reason for full re-index (if applicable)""" + diff --git a/src/search.py b/src/search.py new file mode 100644 index 0000000..fdac426 --- /dev/null +++ b/src/search.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 +""" +CLI tool to search the indexed repository + +Usage: + cd examples/python-sdk/context + python -m github_action_indexer search "your search query" + python -m github_action_indexer search "your search query" --max-chars 5000 +""" + +import argparse +import json +import os +import re +import sys +import tempfile +from pathlib import Path +from typing import Optional + +from auggie_sdk.context import DirectContext + +from .models import IndexState + + +def get_state_path() -> str: + """Get the state file path for the current branch.""" + branch = os.environ.get("BRANCH", "main") + sanitized_branch = re.sub(r"[^a-zA-Z0-9\-_]", "-", branch) + return os.environ.get( + "STATE_PATH", f".augment-index-state/{sanitized_branch}/state.json" + ) + + +def load_state(state_path: str) -> Optional[IndexState]: + """Load index state from file system.""" + try: + with open(state_path, "r") as f: + data = f.read() + return json.loads(data) + except FileNotFoundError: + return None + + +def main() -> None: + """Main search function.""" + # Parse command line arguments + parser = argparse.ArgumentParser( + description="Search the indexed repository", + epilog='Example: python search.py "authentication functions"', + ) + parser.add_argument("query", help="Search query") + parser.add_argument( + "--max-chars", + type=int, + help="Maximum number of characters in output", + dest="max_chars", + ) + args = parser.parse_args() + + # Get API credentials + api_token = os.environ.get("AUGMENT_API_TOKEN") + if not api_token: + print("Error: AUGMENT_API_TOKEN environment variable is required", file=sys.stderr) + sys.exit(1) + + api_url = os.environ.get("AUGMENT_API_URL") + if not api_url: + print( + "Error: AUGMENT_API_URL environment variable is required. Please set it to your " + "tenant-specific URL (e.g., 'https://your-tenant.api.augmentcode.com/')", + file=sys.stderr, + ) + sys.exit(1) + + print(f'Searching for: "{args.query}"') + if args.max_chars is not None: + print(f"Limiting results to max {args.max_chars} characters\n") + else: + print() + + try: + # Load the index state first + state_path = get_state_path() + print(f"Loading index state from: {state_path}") + state = load_state(state_path) + + if not state: + print("Error: No index state found. 
Run indexing first.", file=sys.stderr)
+            print("  python -m src.main", file=sys.stderr)
+            sys.exit(1)
+
+        # Create a temporary file with the context state for import
+        # Use delete=False because Windows can't reopen a NamedTemporaryFile while it's open
+        temp_file = tempfile.NamedTemporaryFile(
+            mode="w", suffix=".json", prefix="github-indexer-state-", delete=False
+        )
+        temp_path = Path(temp_file.name)
+        try:
+            json.dump(state["contextState"], temp_file, indent=2)
+            temp_file.close()  # Close before reading on Windows
+
+            # Import state using DirectContext.import_from_file
+            context = DirectContext.import_from_file(
+                str(temp_path), api_key=api_token, api_url=api_url
+            )
+        finally:
+            temp_path.unlink(missing_ok=True)
+
+        file_count = len(state["contextState"].get("blobs", []))
+
+        print(f"Loaded index: {file_count} files indexed")
+        print(f"Repository: {state['repository']['owner']}/{state['repository']['name']}")
+        print(f"Last indexed commit: {state['lastCommitSha']}\n")
+
+        # Perform search with optional character limit
+        results = context.search(args.query, max_output_length=args.max_chars)
+
+        if not results or results.strip() == "":
+            print("No results found.")
+            return
+
+        print("Search results:\n")
+        print(results)
+
+    except Exception as error:
+        print(f"Search failed: {error}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
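
The filtering pipeline in src/file_filter.py is self-contained, so its behavior
can be checked directly. A minimal sketch (paths and byte contents are
illustrative):

    from src.file_filter import should_filter_file

    # Safe path, small, not keyish, valid UTF-8 -> included
    print(should_filter_file("src/app.py", b"print('hello')"))
    # {'filtered': False}

    # Filename matches the keyish pattern -> skipped before UTF-8 validation
    print(should_filter_file("deploy/id_rsa", b"-----BEGIN RSA PRIVATE KEY-----"))
    # {'filtered': True, 'reason': 'keyish_pattern'}

    # Invalid UTF-8 -> treated as binary and skipped
    print(should_filter_file("logo.png", b"\x89PNG\r\n"))
    # {'filtered': True, 'reason': 'binary_file'}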
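
For a local dry run outside Actions, the two entry points can be driven with
the same environment variables the workflow sets. A sketch under assumed
values (tokens, repository, and query are placeholders); run it from the
repository root so the src package is importable:

    import os
    import subprocess

    env = {
        **os.environ,
        "AUGMENT_API_TOKEN": "...",
        "AUGMENT_API_URL": "https://your-tenant.api.augmentcode.com/",
        "GITHUB_TOKEN": "...",
        "GITHUB_REPOSITORY": "octocat/hello-world",
        "GITHUB_SHA": "HEAD",  # resolved to a full SHA by resolve_commit_sha()
        "GITHUB_REF": "refs/heads/main",
        "GITHUB_REF_NAME": "main",
    }
    subprocess.run(["python", "-m", "src.main"], env=env, check=True)
    subprocess.run(
        ["python", "-m", "src.search", "authentication functions",
         "--max-chars", "5000"],
        env=env,
        check=True,
    )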