Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions .github/workflows/index.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Runs the repository indexer (src/main.py) on pushes to the listed branches
# and on manual dispatch, persisting the index state between runs.
name: Index Repository

on:
  push:
    branches:
      - main
      - develop
      - 'feature/**' # Index feature branches
      - 'release/**' # Index release branches
  workflow_dispatch:
    # NOTE(review): neither `branch` nor `force_full_reindex` is referenced by
    # any step in this workflow — confirm how (or whether) src/main.py
    # receives these values.
    inputs:
      branch:
        description: 'Branch to index (leave empty for current branch)'
        required: false
        type: string
      force_full_reindex:
        description: 'Force full re-index'
        required: false
        type: boolean
        default: false

jobs:
  index:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0 # Full history for comparison

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip'

      - name: Install dependencies
        run: pip install -r requirements.txt

      # The primary key ends in the commit SHA, so a new commit never hits it
      # exactly; restore-keys then falls back to the branch-name prefix.
      - name: Restore index state
        uses: actions/cache@v4
        with:
          path: .augment-index-state
          # Use branch-specific cache key
          key: augment-index-${{ github.ref_name }}-${{ github.sha }}
          restore-keys: |
            augment-index-${{ github.ref_name }}-

      - name: Index repository
        id: index
        run: python src/main.py
        env:
          AUGMENT_API_TOKEN: ${{ secrets.AUGMENT_API_TOKEN }}
          AUGMENT_API_URL: ${{ secrets.AUGMENT_API_URL }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          STORAGE_TYPE: file
          # Branch-specific state path (automatically determined from GITHUB_REF)
          # STATE_PATH is optional - defaults to .augment-index-state/{branch}/state.json
          MAX_COMMITS: 100
          MAX_FILES: 500

      # Runs even when an earlier step failed (`if: always()`).
      # NOTE(review): presumably src/main.py sets these step outputs — confirm.
      - name: Print results
        if: always()
        run: |
          echo "Success: ${{ steps.index.outputs.success }}"
          echo "Type: ${{ steps.index.outputs.type }}"
          echo "Files Indexed: ${{ steps.index.outputs.files_indexed }}"
          echo "Files Deleted: ${{ steps.index.outputs.files_deleted }}"
          echo "Checkpoint ID: ${{ steps.index.outputs.checkpoint_id }}"
          echo "Commit SHA: ${{ steps.index.outputs.commit_sha }}"

      # Publishes the state directory as an artifact only when all prior steps succeeded.
      - name: Upload state artifact
        if: success()
        uses: actions/upload-artifact@v4
        with:
          name: index-state
          path: .augment-index-state/
          retention-days: 30

24 changes: 24 additions & 0 deletions src/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""
GitHub Action Repository Indexer

A Python example showing how to index a GitHub repository using the Augment SDK
Direct Mode with incremental updates.

See README.md for usage instructions.
"""

from .models import FileChange, IndexConfig, IndexResult, IndexState
from .file_filter import should_filter_file
from .github_client import GitHubClient
from .index_manager import IndexManager

# Public API of the package: the names exposed by a star-import.
__all__ = [
    "FileChange",
    "IndexConfig",
    "IndexResult",
    "IndexState",
    "should_filter_file",
    "GitHubClient",
    "IndexManager",
]

123 changes: 123 additions & 0 deletions src/file_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
"""
File filtering logic for GitHub repository indexing.
"""

import re
from pathlib import Path
from typing import Optional

# "Keyish" file names: names that likely hold secrets or key material.
# The expression is applied to a bare file name and must match all of it.
KEYISH_PATTERN = re.compile(
    r'^('
    # a name that is exactly ".git"
    r'\.git'
    # names ending in a key / key-store extension
    r'|.*\.pem|.*\.key|.*\.pfx|.*\.p12|.*\.jks|.*\.keystore|.*\.pkcs12'
    # names ending in a certificate extension
    r'|.*\.crt|.*\.cer'
    # exact names commonly used for SSH private keys
    r'|id_rsa|id_ed25519|id_ecdsa|id_dsa'
    r')$'
)

# Upper bound on file size, in bytes, used when no explicit limit is given (1 MB).
DEFAULT_MAX_FILE_SIZE = 1024 ** 2


def always_ignore_path(path: str) -> bool:
    """
    Report whether a path must be rejected unconditionally (security measure).

    This is a plain substring search: two consecutive dots anywhere in the
    path trigger rejection, whether or not they form a whole path component
    (so a name such as "a..b" is rejected as well).

    Args:
        path: The file path to check.

    Returns:
        True if ".." occurs anywhere in the path, False otherwise.
    """
    return path.find("..") != -1


def is_keyish_path(path: str) -> bool:
    """
    Tell whether a path names a file that looks like a secret or key.

    Only the final path component is examined; directory names are ignored.

    Args:
        path: The file path to check.

    Returns:
        True if the file name matches KEYISH_PATTERN, False otherwise.
    """
    # Directories play no part in the decision — only the last component does.
    basename = Path(path).name
    hit = KEYISH_PATTERN.match(basename)
    return hit is not None


def is_valid_file_size(size_bytes: int, max_file_size: int = DEFAULT_MAX_FILE_SIZE) -> bool:
    """
    Decide whether a file is small enough to upload.

    The limit is inclusive: a file whose size equals ``max_file_size`` is accepted.

    Args:
        size_bytes: The size of the file in bytes.
        max_file_size: Maximum allowed file size in bytes. Defaults to
            DEFAULT_MAX_FILE_SIZE.

    Returns:
        True if ``size_bytes`` does not exceed ``max_file_size``.
    """
    exceeds_limit = size_bytes > max_file_size
    return not exceeds_limit


def is_valid_utf8(content: bytes) -> bool:
    """
    Determine whether raw file content decodes as UTF-8.

    A failed decode is treated as a sign that the content is binary.

    Args:
        content: The file content as bytes.

    Returns:
        True if the bytes decode cleanly as UTF-8, False if decoding raises
        UnicodeDecodeError.
    """
    try:
        # The decoded text is discarded; only success or failure matters.
        content.decode("utf-8")
    except UnicodeDecodeError:
        return False
    else:
        return True


def should_filter_file(
    path: str,
    content: bytes,
    max_file_size: Optional[int] = None,
) -> dict:
    """
    Decide whether a file must be excluded from indexing.

    The checks run in a fixed order and the first one that fails determines
    the reported reason:

        1. Path validation (path contains "..")   -> "path_contains_dotdot"
        2. File size check                        -> "file_too_large (N bytes)"
        3. Keyish (secret/key) file name          -> "keyish_pattern"
        4. UTF-8 validation (binary detection)    -> "binary_file"

    The original notes place .augmentignore rules between steps 2 and 3 and
    .gitignore rules between steps 3 and 4 (priority order taken from
    file-filtering.md); those rules are checked by the caller, not here.

    Args:
        path: The file path to check.
        content: The file content as bytes.
        max_file_size: Maximum allowed file size in bytes. ``None`` selects
            DEFAULT_MAX_FILE_SIZE.

    Returns:
        ``{"filtered": True, "reason": "..."}`` if the file should be skipped,
        ``{"filtered": False}`` if it should be included.
    """
    size_limit = DEFAULT_MAX_FILE_SIZE if max_file_size is None else max_file_size

    if always_ignore_path(path):
        # Security: refuse anything containing ".." before inspecting content.
        reason = "path_contains_dotdot"
    elif not is_valid_file_size(len(content), size_limit):
        reason = f"file_too_large ({len(content)} bytes)"
    elif is_keyish_path(path):
        # File name looks like a secret or key.
        reason = "keyish_pattern"
    elif not is_valid_utf8(content):
        # Content that is not valid UTF-8 is treated as binary.
        reason = "binary_file"
    else:
        return {"filtered": False}

    return {"filtered": True, "reason": reason}

Loading
Loading