From 793ece49bbbc851e467938299af3c214b30584a8 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 10 Dec 2025 21:15:58 -0500 Subject: [PATCH 1/8] Add indexed_branches table for multi-branch support This enables future multi-branch indexing by tracking branch state across indexing operations. Signed-off-by: Sasha Levin --- docs/schema.md | 36 ++- src/database/branches.rs | 619 +++++++++++++++++++++++++++++++++++++ src/database/connection.rs | 78 +++++ src/database/mod.rs | 1 + src/database/schema.rs | 57 ++++ 5 files changed, 789 insertions(+), 2 deletions(-) create mode 100644 src/database/branches.rs diff --git a/docs/schema.md b/docs/schema.md index 82901d0..a550380 100644 --- a/docs/schema.md +++ b/docs/schema.md @@ -36,7 +36,8 @@ The database consists of the following tables: 7. **git_commits** - Git commit metadata with unified diffs and changed symbols 8. **lore** - Lore.kernel.org email archive with FTS indices for fast searching 9. **lore_vectors** - Vector embeddings for semantic search of lore emails -10. **content_0 through content_15** - Deduplicated content storage (16 shards) +10. **indexed_branches** - Tracks which git branches have been indexed with their tip commits +11. **content_0 through content_15** - Deduplicated content storage (16 shards) ## Table Schemas @@ -354,7 +355,38 @@ Embeddings combine from, subject, recipients, and body into a single representat - IVF-PQ vector index for fast approximate nearest neighbor search -### 10. content_0 through content_15 (Content Shards) +### 10. indexed_branches + +Tracks which git branches have been indexed, enabling multi-branch support and efficient incremental indexing across branches. 
+ +**Schema:** +``` +branch_name (Utf8, NOT NULL) - Branch name (e.g., "main", "origin/develop") +tip_commit (Utf8, NOT NULL) - Commit SHA at the tip when indexed (40-char hex) +indexed_at (Int64, NOT NULL) - Unix timestamp of when branch was last indexed +remote (Utf8, nullable) - Remote name if tracking branch (e.g., "origin") +``` + +**Purpose:** +- Tracks which branches have been indexed and at which commit +- Enables efficient multi-branch indexing by skipping already-current branches +- Supports both local branches (e.g., "main") and remote-tracking branches (e.g., "origin/develop") +- Stores indexing timestamp for freshness tracking + +**Use Cases:** +- Multi-branch indexing: `semcode-index --branches main,develop,feature-x` +- Branch update detection: Skip branches already indexed at current tip +- Query scoping: Limit queries to specific branch context +- Branch cleanup: Remove data for deleted branches + +**Indices:** +- BTree on `branch_name` (primary lookup by branch name) +- BTree on `tip_commit` (find branches at specific commits) +- BTree on `remote` (filter by remote) + +--- + +### 11. content_0 through content_15 (Content Shards) Stores deduplicated content referenced by other tables, distributed across 16 shard tables for optimal performance. diff --git a/src/database/branches.rs b/src/database/branches.rs new file mode 100644 index 0000000..0949cd9 --- /dev/null +++ b/src/database/branches.rs @@ -0,0 +1,619 @@ +// SPDX-License-Identifier: MIT OR Apache-2.0 +//! Store for tracking indexed git branches. +//! +//! This module provides functionality to track which branches have been indexed, +//! their tip commits, and when they were last indexed. This enables efficient +//! multi-branch indexing by avoiding re-indexing branches that haven't changed. 
+ +use anyhow::Result; +use arrow::array::{Array, ArrayRef, Int64Array, RecordBatch, StringBuilder}; +use arrow::datatypes::{DataType, Field, Schema}; +use arrow::record_batch::RecordBatchIterator; +use futures::TryStreamExt; +use lancedb::connection::Connection; +use lancedb::query::{ExecutableQuery, QueryBase}; +use std::sync::Arc; + +/// Information about an indexed branch +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct IndexedBranchInfo { + /// Branch name (e.g., "main", "origin/develop") + pub branch_name: String, + /// The commit SHA at the tip of the branch when indexed + pub tip_commit: String, + /// Unix timestamp of when the branch was last indexed + pub indexed_at: i64, + /// Remote name if this is a remote-tracking branch (e.g., "origin") + pub remote: Option, +} + +/// JSON-serializable version of IndexedBranchInfo +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct IndexedBranchInfoJson { + pub branch_name: String, + pub tip_commit: String, + pub indexed_at: i64, + pub remote: Option, +} + +impl From for IndexedBranchInfoJson { + fn from(info: IndexedBranchInfo) -> Self { + IndexedBranchInfoJson { + branch_name: info.branch_name, + tip_commit: info.tip_commit, + indexed_at: info.indexed_at, + remote: info.remote, + } + } +} + +impl From for IndexedBranchInfo { + fn from(json: IndexedBranchInfoJson) -> Self { + IndexedBranchInfo { + branch_name: json.branch_name, + tip_commit: json.tip_commit, + indexed_at: json.indexed_at, + remote: json.remote, + } + } +} + +/// Store for managing indexed branch records +pub struct IndexedBranchStore { + connection: Connection, +} + +impl IndexedBranchStore { + pub fn new(connection: Connection) -> Self { + Self { connection } + } + + /// Get the Arrow schema for the indexed_branches table + pub fn get_schema() -> Arc { + Arc::new(Schema::new(vec![ + Field::new("branch_name", DataType::Utf8, false), + Field::new("tip_commit", DataType::Utf8, false), + Field::new("indexed_at", 
DataType::Int64, false), + Field::new("remote", DataType::Utf8, true), + ])) + } + + /// Record that a branch has been indexed at a specific commit + pub async fn record_branch_indexed(&self, info: &IndexedBranchInfo) -> Result<()> { + // First, remove any existing record for this branch + self.remove_branch(&info.branch_name).await?; + + let table = self + .connection + .open_table("indexed_branches") + .execute() + .await?; + + // Build arrays for each column + let mut branch_name_builder = StringBuilder::new(); + let mut tip_commit_builder = StringBuilder::new(); + let mut indexed_at_builder = arrow::array::Int64Builder::new(); + let mut remote_builder = StringBuilder::new(); + + branch_name_builder.append_value(&info.branch_name); + tip_commit_builder.append_value(&info.tip_commit); + indexed_at_builder.append_value(info.indexed_at); + match &info.remote { + Some(r) => remote_builder.append_value(r), + None => remote_builder.append_null(), + } + + let schema = Self::get_schema(); + + let batch = RecordBatch::try_from_iter(vec![ + ( + "branch_name", + Arc::new(branch_name_builder.finish()) as ArrayRef, + ), + ( + "tip_commit", + Arc::new(tip_commit_builder.finish()) as ArrayRef, + ), + ( + "indexed_at", + Arc::new(indexed_at_builder.finish()) as ArrayRef, + ), + ("remote", Arc::new(remote_builder.finish()) as ArrayRef), + ])?; + + let batches = vec![Ok(batch)]; + let batch_iterator = RecordBatchIterator::new(batches.into_iter(), schema); + table.add(batch_iterator).execute().await?; + + Ok(()) + } + + /// Get the tip commit for a specific branch + pub async fn get_branch_tip(&self, branch_name: &str) -> Result> { + let info = self.get_branch_info(branch_name).await?; + Ok(info.map(|i| i.tip_commit)) + } + + /// Get full information about a specific branch + pub async fn get_branch_info(&self, branch_name: &str) -> Result> { + let table = self + .connection + .open_table("indexed_branches") + .execute() + .await?; + + let escaped_name = branch_name.replace("'", 
"''"); + let filter = format!("branch_name = '{escaped_name}'"); + + let results = table + .query() + .only_if(filter) + .limit(1) + .execute() + .await? + .try_collect::>() + .await?; + + if results.is_empty() || results[0].num_rows() == 0 { + return Ok(None); + } + + self.extract_record_from_batch(&results[0], 0) + } + + /// List all indexed branches + pub async fn list_indexed_branches(&self) -> Result> { + let table = self + .connection + .open_table("indexed_branches") + .execute() + .await?; + + let results = table + .query() + .execute() + .await? + .try_collect::>() + .await?; + + let mut branches = Vec::new(); + for batch in &results { + for i in 0..batch.num_rows() { + if let Some(info) = self.extract_record_from_batch(batch, i)? { + branches.push(info); + } + } + } + + // Sort by branch name for consistent output + branches.sort_by(|a, b| a.branch_name.cmp(&b.branch_name)); + + Ok(branches) + } + + /// Check if a branch is indexed at the current tip commit + pub async fn is_branch_current(&self, branch_name: &str, current_tip: &str) -> Result { + if let Some(info) = self.get_branch_info(branch_name).await? 
{ + Ok(info.tip_commit == current_tip) + } else { + Ok(false) + } + } + + /// Remove a branch record (used when branch is deleted or before updating) + pub async fn remove_branch(&self, branch_name: &str) -> Result<()> { + let table = self + .connection + .open_table("indexed_branches") + .execute() + .await?; + + let escaped_name = branch_name.replace("'", "''"); + let filter = format!("branch_name = '{escaped_name}'"); + + table.delete(&filter).await?; + Ok(()) + } + + /// Remove all branches that match a remote prefix (e.g., "origin/") + pub async fn remove_branches_by_remote(&self, remote: &str) -> Result { + let branches = self.list_indexed_branches().await?; + let mut removed = 0; + + for branch in branches { + if branch.remote.as_deref() == Some(remote) { + self.remove_branch(&branch.branch_name).await?; + removed += 1; + } + } + + Ok(removed) + } + + /// Get all branches that point to a specific commit + pub async fn get_branches_at_commit(&self, commit_sha: &str) -> Result> { + let table = self + .connection + .open_table("indexed_branches") + .execute() + .await?; + + let escaped_sha = commit_sha.replace("'", "''"); + let filter = format!("tip_commit = '{escaped_sha}'"); + + let results = table + .query() + .only_if(filter) + .execute() + .await? + .try_collect::>() + .await?; + + let mut branches = Vec::new(); + for batch in &results { + for i in 0..batch.num_rows() { + if let Some(info) = self.extract_record_from_batch(batch, i)? { + branches.push(info); + } + } + } + + Ok(branches) + } + + /// Get total count of indexed branches + pub async fn count(&self) -> Result { + let table = self + .connection + .open_table("indexed_branches") + .execute() + .await?; + Ok(table.count_rows(None).await?) 
+ } + + /// Extract an IndexedBranchInfo from a batch at the given row index + fn extract_record_from_batch( + &self, + batch: &RecordBatch, + row: usize, + ) -> Result> { + let branch_name_array = batch + .column_by_name("branch_name") + .ok_or_else(|| anyhow::anyhow!("Missing branch_name column"))? + .as_any() + .downcast_ref::() + .ok_or_else(|| anyhow::anyhow!("Invalid branch_name column type"))?; + + let tip_commit_array = batch + .column_by_name("tip_commit") + .ok_or_else(|| anyhow::anyhow!("Missing tip_commit column"))? + .as_any() + .downcast_ref::() + .ok_or_else(|| anyhow::anyhow!("Invalid tip_commit column type"))?; + + let indexed_at_array = batch + .column_by_name("indexed_at") + .ok_or_else(|| anyhow::anyhow!("Missing indexed_at column"))? + .as_any() + .downcast_ref::() + .ok_or_else(|| anyhow::anyhow!("Invalid indexed_at column type"))?; + + let remote_array = batch + .column_by_name("remote") + .ok_or_else(|| anyhow::anyhow!("Missing remote column"))? + .as_any() + .downcast_ref::() + .ok_or_else(|| anyhow::anyhow!("Invalid remote column type"))?; + + let branch_name = branch_name_array.value(row).to_string(); + let tip_commit = tip_commit_array.value(row).to_string(); + let indexed_at = indexed_at_array.value(row); + let remote = if remote_array.is_null(row) { + None + } else { + Some(remote_array.value(row).to_string()) + }; + + Ok(Some(IndexedBranchInfo { + branch_name, + tip_commit, + indexed_at, + remote, + })) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + async fn create_test_store() -> (TempDir, IndexedBranchStore) { + let tmpdir = TempDir::new().unwrap(); + let db_path = tmpdir.path().to_str().unwrap(); + let connection = lancedb::connect(db_path).execute().await.unwrap(); + + // Create the table + let schema = IndexedBranchStore::get_schema(); + let empty_batch = RecordBatch::new_empty(schema.clone()); + let batches = vec![Ok(empty_batch)]; + let batch_iterator = 
RecordBatchIterator::new(batches.into_iter(), schema); + connection + .create_table("indexed_branches", batch_iterator) + .execute() + .await + .unwrap(); + + let store = IndexedBranchStore::new(connection); + (tmpdir, store) + } + + #[tokio::test] + async fn test_record_and_get_branch() { + let (_tmpdir, store) = create_test_store().await; + + let info = IndexedBranchInfo { + branch_name: "main".to_string(), + tip_commit: "abc123def456".to_string(), + indexed_at: 1699900000, + remote: None, + }; + + store.record_branch_indexed(&info).await.unwrap(); + + let retrieved = store.get_branch_info("main").await.unwrap(); + assert!(retrieved.is_some()); + let retrieved = retrieved.unwrap(); + assert_eq!(retrieved.branch_name, "main"); + assert_eq!(retrieved.tip_commit, "abc123def456"); + assert_eq!(retrieved.indexed_at, 1699900000); + assert!(retrieved.remote.is_none()); + } + + #[tokio::test] + async fn test_record_remote_branch() { + let (_tmpdir, store) = create_test_store().await; + + let info = IndexedBranchInfo { + branch_name: "origin/develop".to_string(), + tip_commit: "789abc123".to_string(), + indexed_at: 1699900100, + remote: Some("origin".to_string()), + }; + + store.record_branch_indexed(&info).await.unwrap(); + + let retrieved = store.get_branch_info("origin/develop").await.unwrap(); + assert!(retrieved.is_some()); + let retrieved = retrieved.unwrap(); + assert_eq!(retrieved.remote, Some("origin".to_string())); + } + + #[tokio::test] + async fn test_update_branch() { + let (_tmpdir, store) = create_test_store().await; + + // Record initial version + let info1 = IndexedBranchInfo { + branch_name: "main".to_string(), + tip_commit: "commit1".to_string(), + indexed_at: 1699900000, + remote: None, + }; + store.record_branch_indexed(&info1).await.unwrap(); + + // Update with new commit + let info2 = IndexedBranchInfo { + branch_name: "main".to_string(), + tip_commit: "commit2".to_string(), + indexed_at: 1699900100, + remote: None, + }; + 
store.record_branch_indexed(&info2).await.unwrap(); + + // Should have only one record with the new commit + let retrieved = store.get_branch_info("main").await.unwrap().unwrap(); + assert_eq!(retrieved.tip_commit, "commit2"); + assert_eq!(retrieved.indexed_at, 1699900100); + + // Count should be 1 + assert_eq!(store.count().await.unwrap(), 1); + } + + #[tokio::test] + async fn test_list_indexed_branches() { + let (_tmpdir, store) = create_test_store().await; + + let branches = vec![ + IndexedBranchInfo { + branch_name: "main".to_string(), + tip_commit: "commit1".to_string(), + indexed_at: 1699900000, + remote: None, + }, + IndexedBranchInfo { + branch_name: "develop".to_string(), + tip_commit: "commit2".to_string(), + indexed_at: 1699900100, + remote: None, + }, + IndexedBranchInfo { + branch_name: "origin/feature".to_string(), + tip_commit: "commit3".to_string(), + indexed_at: 1699900200, + remote: Some("origin".to_string()), + }, + ]; + + for info in &branches { + store.record_branch_indexed(info).await.unwrap(); + } + + let listed = store.list_indexed_branches().await.unwrap(); + assert_eq!(listed.len(), 3); + + // Should be sorted by name + assert_eq!(listed[0].branch_name, "develop"); + assert_eq!(listed[1].branch_name, "main"); + assert_eq!(listed[2].branch_name, "origin/feature"); + } + + #[tokio::test] + async fn test_is_branch_current() { + let (_tmpdir, store) = create_test_store().await; + + let info = IndexedBranchInfo { + branch_name: "main".to_string(), + tip_commit: "abc123".to_string(), + indexed_at: 1699900000, + remote: None, + }; + store.record_branch_indexed(&info).await.unwrap(); + + assert!(store.is_branch_current("main", "abc123").await.unwrap()); + assert!(!store.is_branch_current("main", "def456").await.unwrap()); + assert!(!store + .is_branch_current("nonexistent", "abc123") + .await + .unwrap()); + } + + #[tokio::test] + async fn test_remove_branch() { + let (_tmpdir, store) = create_test_store().await; + + let info = IndexedBranchInfo { 
+ branch_name: "feature".to_string(), + tip_commit: "abc123".to_string(), + indexed_at: 1699900000, + remote: None, + }; + store.record_branch_indexed(&info).await.unwrap(); + + assert!(store.get_branch_info("feature").await.unwrap().is_some()); + + store.remove_branch("feature").await.unwrap(); + + assert!(store.get_branch_info("feature").await.unwrap().is_none()); + } + + #[tokio::test] + async fn test_remove_branches_by_remote() { + let (_tmpdir, store) = create_test_store().await; + + let branches = vec![ + IndexedBranchInfo { + branch_name: "main".to_string(), + tip_commit: "commit1".to_string(), + indexed_at: 1699900000, + remote: None, + }, + IndexedBranchInfo { + branch_name: "origin/main".to_string(), + tip_commit: "commit2".to_string(), + indexed_at: 1699900100, + remote: Some("origin".to_string()), + }, + IndexedBranchInfo { + branch_name: "origin/develop".to_string(), + tip_commit: "commit3".to_string(), + indexed_at: 1699900200, + remote: Some("origin".to_string()), + }, + IndexedBranchInfo { + branch_name: "upstream/main".to_string(), + tip_commit: "commit4".to_string(), + indexed_at: 1699900300, + remote: Some("upstream".to_string()), + }, + ]; + + for info in &branches { + store.record_branch_indexed(info).await.unwrap(); + } + + let removed = store.remove_branches_by_remote("origin").await.unwrap(); + assert_eq!(removed, 2); + + let remaining = store.list_indexed_branches().await.unwrap(); + assert_eq!(remaining.len(), 2); + assert!(remaining.iter().any(|b| b.branch_name == "main")); + assert!(remaining.iter().any(|b| b.branch_name == "upstream/main")); + } + + #[tokio::test] + async fn test_get_branches_at_commit() { + let (_tmpdir, store) = create_test_store().await; + + let shared_commit = "shared123"; + let branches = vec![ + IndexedBranchInfo { + branch_name: "main".to_string(), + tip_commit: shared_commit.to_string(), + indexed_at: 1699900000, + remote: None, + }, + IndexedBranchInfo { + branch_name: "release".to_string(), + tip_commit: 
shared_commit.to_string(), + indexed_at: 1699900100, + remote: None, + }, + IndexedBranchInfo { + branch_name: "develop".to_string(), + tip_commit: "different456".to_string(), + indexed_at: 1699900200, + remote: None, + }, + ]; + + for info in &branches { + store.record_branch_indexed(info).await.unwrap(); + } + + let at_shared = store.get_branches_at_commit(shared_commit).await.unwrap(); + assert_eq!(at_shared.len(), 2); + assert!(at_shared.iter().any(|b| b.branch_name == "main")); + assert!(at_shared.iter().any(|b| b.branch_name == "release")); + } + + #[tokio::test] + async fn test_get_branch_tip() { + let (_tmpdir, store) = create_test_store().await; + + let info = IndexedBranchInfo { + branch_name: "main".to_string(), + tip_commit: "abc123".to_string(), + indexed_at: 1699900000, + remote: None, + }; + store.record_branch_indexed(&info).await.unwrap(); + + assert_eq!( + store.get_branch_tip("main").await.unwrap(), + Some("abc123".to_string()) + ); + assert_eq!(store.get_branch_tip("nonexistent").await.unwrap(), None); + } + + #[tokio::test] + async fn test_branch_name_with_special_chars() { + let (_tmpdir, store) = create_test_store().await; + + // Test branch names with characters that need escaping + let info = IndexedBranchInfo { + branch_name: "feature/user's-branch".to_string(), + tip_commit: "abc123".to_string(), + indexed_at: 1699900000, + remote: None, + }; + store.record_branch_indexed(&info).await.unwrap(); + + let retrieved = store + .get_branch_info("feature/user's-branch") + .await + .unwrap(); + assert!(retrieved.is_some()); + assert_eq!(retrieved.unwrap().branch_name, "feature/user's-branch"); + } +} diff --git a/src/database/connection.rs b/src/database/connection.rs index 409bc5d..a56938f 100644 --- a/src/database/connection.rs +++ b/src/database/connection.rs @@ -9,6 +9,7 @@ use lancedb::index::scalar::FullTextSearchQuery; use lancedb::query::ExecutableQuery; use lancedb::query::QueryBase; +use crate::database::branches::IndexedBranchStore; 
use crate::database::functions::FunctionStore; use crate::database::schema::SchemaManager; use crate::database::search::{SearchManager, VectorSearchManager}; @@ -36,6 +37,7 @@ pub struct DatabaseManager { processed_file_store: ProcessedFileStore, content_store: ContentStore, symbol_filename_store: SymbolFilenameStore, + branch_store: IndexedBranchStore, } impl DatabaseManager { @@ -54,6 +56,7 @@ impl DatabaseManager { processed_file_store: ProcessedFileStore::new(connection.clone()), content_store: ContentStore::new(connection.clone()), symbol_filename_store: SymbolFilenameStore::new(connection.clone()), + branch_store: IndexedBranchStore::new(connection.clone()), }) } @@ -79,6 +82,7 @@ impl DatabaseManager { "symbol_filename", "git_commits", "lore", + "indexed_branches", ] { if let Ok(table) = self.connection.open_table(*table_name).execute().await { table.delete("1=1").await?; @@ -2498,6 +2502,80 @@ impl DatabaseManager { self.processed_file_store.get_all_file_git_sha_pairs().await } + // ==================== Branch Management ==================== + + /// Record that a branch has been indexed at a specific commit + pub async fn record_branch_indexed( + &self, + branch_name: &str, + tip_commit: &str, + remote: Option<&str>, + ) -> Result<()> { + use crate::database::branches::IndexedBranchInfo; + let info = IndexedBranchInfo { + branch_name: branch_name.to_string(), + tip_commit: tip_commit.to_string(), + indexed_at: std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs() as i64, + remote: remote.map(|s| s.to_string()), + }; + self.branch_store.record_branch_indexed(&info).await + } + + /// Get the tip commit for a specific branch + pub async fn get_branch_tip(&self, branch_name: &str) -> Result> { + self.branch_store.get_branch_tip(branch_name).await + } + + /// Get full information about a specific indexed branch + pub async fn get_indexed_branch_info( + &self, + branch_name: &str, + ) -> Result> { + 
self.branch_store.get_branch_info(branch_name).await + } + + /// List all indexed branches + pub async fn list_indexed_branches( + &self, + ) -> Result> { + self.branch_store.list_indexed_branches().await + } + + /// Check if a branch is indexed at the current tip commit + pub async fn is_branch_current(&self, branch_name: &str, current_tip: &str) -> Result { + self.branch_store + .is_branch_current(branch_name, current_tip) + .await + } + + /// Remove a branch record (used when branch is deleted) + pub async fn remove_indexed_branch(&self, branch_name: &str) -> Result<()> { + self.branch_store.remove_branch(branch_name).await + } + + /// Remove all branches for a specific remote + pub async fn remove_branches_by_remote(&self, remote: &str) -> Result { + self.branch_store.remove_branches_by_remote(remote).await + } + + /// Get all branches that point to a specific commit + pub async fn get_branches_at_commit( + &self, + commit_sha: &str, + ) -> Result> { + self.branch_store.get_branches_at_commit(commit_sha).await + } + + /// Get count of indexed branches + pub async fn get_indexed_branch_count(&self) -> Result { + self.branch_store.count().await + } + + // ==================== End Branch Management ==================== + pub async fn get_existing_function_names(&self) -> Result> { use futures::TryStreamExt; diff --git a/src/database/mod.rs b/src/database/mod.rs index 2b9ac1f..dec2ef2 100644 --- a/src/database/mod.rs +++ b/src/database/mod.rs @@ -1,4 +1,5 @@ // SPDX-License-Identifier: MIT OR Apache-2.0 +pub mod branches; pub mod calls; mod connection; pub mod content; diff --git a/src/database/schema.rs b/src/database/schema.rs index b7e06c2..e5377f0 100644 --- a/src/database/schema.rs +++ b/src/database/schema.rs @@ -58,6 +58,10 @@ impl SchemaManager { self.create_lore_vectors_table().await?; } + if !table_names.iter().any(|n| n == "indexed_branches") { + self.create_indexed_branches_table().await?; + } + // Check and create content shard tables (content_0 
through content_15) self.create_content_shard_tables().await?; @@ -277,6 +281,23 @@ impl SchemaManager { Ok(()) } + async fn create_indexed_branches_table(&self) -> Result<()> { + use crate::database::branches::IndexedBranchStore; + + let schema = IndexedBranchStore::get_schema(); + let empty_batch = RecordBatch::new_empty(schema.clone()); + let batches = vec![Ok(empty_batch)]; + let batch_iterator = RecordBatchIterator::new(batches.into_iter(), schema); + + self.connection + .create_table("indexed_branches", batch_iterator) + .execute() + .await?; + + tracing::info!("Created indexed_branches table for multi-branch support"); + Ok(()) + } + async fn create_content_table(&self) -> Result<()> { let schema = Arc::new(Schema::new(vec![ Field::new("blake3_hash", DataType::Utf8, false), // Blake3 hash of content as hex string @@ -656,6 +677,39 @@ impl SchemaManager { // Note: BTree indices on body, recipients, and symbols removed - FTS used instead } + // Create indices for indexed_branches table + if table_names.iter().any(|n| n == "indexed_branches") { + let table = self + .connection + .open_table("indexed_branches") + .execute() + .await?; + + // Primary index on branch_name for fast branch lookups + self.try_create_index( + &table, + &["branch_name"], + "BTree index on indexed_branches.branch_name", + ) + .await; + + // Index on tip_commit for finding branches at specific commits + self.try_create_index( + &table, + &["tip_commit"], + "BTree index on indexed_branches.tip_commit", + ) + .await; + + // Index on remote for remote-based queries + self.try_create_index( + &table, + &["remote"], + "BTree index on indexed_branches.remote", + ) + .await; + } + // Create indices for all content shard tables for shard in 0..16u8 { let table_name = format!("content_{shard}"); @@ -820,6 +874,7 @@ impl SchemaManager { "symbol_filename", "git_commits", "lore", + "indexed_branches", ]; // Add all content shard tables @@ -999,6 +1054,7 @@ impl SchemaManager { "symbol_filename", 
"git_commits", "lore", + "indexed_branches", ]; // Add all content shard tables @@ -1170,6 +1226,7 @@ impl SchemaManager { "symbol_filename" => self.create_symbol_filename_table().await, "git_commits" => self.create_git_commits_table().await, "lore" => self.create_lore_table().await, + "indexed_branches" => self.create_indexed_branches_table().await, "content" => self.create_content_table().await, name if name.starts_with("content_") => { // Handle content shard tables From 62351d77419238e458bf722249e6cd0695f47ed0 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 10 Dec 2025 21:20:37 -0500 Subject: [PATCH 2/8] Add git branch operations for multi-branch support These functions enable branch resolution for queries and efficient incremental indexing across multiple branches. Signed-off-by: Sasha Levin --- src/git.rs | 245 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 245 insertions(+) diff --git a/src/git.rs b/src/git.rs index 78af454..d9fc8c6 100644 --- a/src/git.rs +++ b/src/git.rs @@ -243,6 +243,124 @@ mod tests { assert!(result.is_ok()); // May or may not be in a git repo depending on system } + + // ==================== Branch Operations Tests ==================== + + #[test] + fn test_get_current_branch() { + // This should work if running in a git repository + let result = get_current_branch("."); + assert!(result.is_ok()); + + // If we're on a branch (not detached HEAD), we should get a name + if let Ok(Some(branch)) = result { + assert!(!branch.is_empty()); + // Branch name should not contain refs/heads/ prefix + assert!(!branch.starts_with("refs/")); + } + } + + #[test] + fn test_list_branches_local() { + // List local branches only + let result = list_branches(".", false); + assert!(result.is_ok()); + + let branches = result.unwrap(); + // Should have at least one local branch (main/master) + assert!(!branches.is_empty()); + + // All should be local branches + for branch in &branches { + assert!(!branch.is_remote); + 
assert!(branch.remote.is_none()); + // Tip commit should be a valid SHA + assert_eq!(branch.tip_commit.len(), 40); + assert!(branch.tip_commit.chars().all(|c| c.is_ascii_hexdigit())); + } + } + + #[test] + fn test_list_branches_with_remote() { + // List all branches including remote + let result = list_branches(".", true); + assert!(result.is_ok()); + + let branches = result.unwrap(); + // Should have at least one branch + assert!(!branches.is_empty()); + + // Check that remote branches have proper attributes + for branch in &branches { + if branch.is_remote { + // Remote branches should have a remote name + assert!(branch.remote.is_some()); + // Name should contain remote prefix (e.g., "origin/main") + assert!(branch.name.contains('/')); + } + } + } + + #[test] + fn test_resolve_branch_main() { + // Try to resolve 'main' or 'master' branch + let main_result = resolve_branch(".", "main"); + let master_result = resolve_branch(".", "master"); + + // At least one should succeed + let resolved = main_result.or(master_result); + assert!(resolved.is_ok()); + + let sha = resolved.unwrap(); + assert_eq!(sha.len(), 40); + assert!(sha.chars().all(|c| c.is_ascii_hexdigit())); + } + + #[test] + fn test_resolve_branch_head() { + // HEAD should always resolve + let result = resolve_branch(".", "HEAD"); + assert!(result.is_ok()); + + let sha = result.unwrap(); + assert_eq!(sha.len(), 40); + } + + #[test] + fn test_branch_exists() { + // HEAD always exists + let result = branch_exists(".", "HEAD"); + assert!(result.is_ok()); + assert!(result.unwrap()); + + // Nonsense branch should not exist + let result = branch_exists(".", "definitely-not-a-real-branch-12345"); + assert!(result.is_ok()); + assert!(!result.unwrap()); + } + + #[test] + fn test_find_merge_base_same_commit() { + // Merge base of HEAD with itself should be HEAD + let result = find_merge_base(".", "HEAD", "HEAD"); + assert!(result.is_ok()); + + let base = result.unwrap(); + let head = resolve_branch(".", 
"HEAD").unwrap(); + assert_eq!(base, head); + } + + #[test] + fn test_find_merge_base_with_parent() { + // Merge base of HEAD and HEAD~1 should be HEAD~1 + let result = find_merge_base(".", "HEAD", "HEAD~1"); + + // This might fail if the repo has only one commit + if let Ok(base) = result { + assert_eq!(base.len(), 40); + assert!(base.chars().all(|c| c.is_ascii_hexdigit())); + } + } } /// Get git hash for a specific file at a specific commit @@ -932,6 +1050,133 @@ fn generate_commit_diff_with_symbols( Ok((diff_output, all_symbols, changed_files)) } +// ==================== Branch Operations ==================== + +/// Information about a git branch +#[derive(Debug, Clone)] +pub struct BranchRef { + /// Branch name (short form, e.g., "main" or "origin/develop") + pub name: String, + /// Commit SHA at the tip of the branch + pub tip_commit: String, + /// Whether this is a remote-tracking branch + pub is_remote: bool, + /// Remote name if this is a remote-tracking branch + pub remote: Option, +} + +/// Resolve a branch name to its tip commit SHA +/// Supports both local branches (e.g., "main") and remote branches (e.g., "origin/develop") +pub fn resolve_branch>(repo_path: P, branch_name: &str) -> Result { + let repo = gix::discover(repo_path)?; + + // Try to find as a local branch first + let local_ref = format!("refs/heads/{}", branch_name); + if let Ok(reference) = repo.find_reference(&local_ref) { + if let Ok(commit) = reference.into_fully_peeled_id() { + return Ok(commit.to_string()); + } + } + + // Try to find as a remote branch (e.g., "origin/main" -> "refs/remotes/origin/main") + let remote_ref = format!("refs/remotes/{}", branch_name); + if let Ok(reference) = repo.find_reference(&remote_ref) { + if let Ok(commit) = reference.into_fully_peeled_id() { + return Ok(commit.to_string()); + } + } + + // Try using resolve_to_commit as a fallback (handles tags and SHAs) + let commit = resolve_to_commit(&repo, branch_name)?; + Ok(commit.id().to_string()) +} + +/// 
List all branches in the repository +/// If include_remote is true, also includes remote-tracking branches +pub fn list_branches>(repo_path: P, include_remote: bool) -> Result> { + let repo = gix::discover(repo_path)?; + let mut branches = Vec::new(); + + // Get all references + let refs = repo.references()?; + + // Process local branches + for reference in refs.local_branches()?.flatten() { + let name = reference.name().shorten().to_string(); + + if let Ok(commit) = reference.into_fully_peeled_id() { + branches.push(BranchRef { + name, + tip_commit: commit.to_string(), + is_remote: false, + remote: None, + }); + } + } + + // Process remote branches if requested + if include_remote { + let refs = repo.references()?; + for reference in refs.remote_branches()?.flatten() { + let full_name = reference.name().shorten().to_string(); + + // Extract remote name from the branch name (e.g., "origin/main" -> "origin") + let remote = full_name.split('/').next().map(String::from); + + if let Ok(commit) = reference.into_fully_peeled_id() { + branches.push(BranchRef { + name: full_name, + tip_commit: commit.to_string(), + is_remote: true, + remote, + }); + } + } + } + + // Sort by name for consistent output + branches.sort_by(|a, b| a.name.cmp(&b.name)); + + Ok(branches) +} + +/// Find the merge base (common ancestor) between two refs +/// This is useful for finding the optimal starting point for incremental indexing +pub fn find_merge_base>(repo_path: P, ref_a: &str, ref_b: &str) -> Result { + let repo = gix::discover(repo_path)?; + + let commit_a = resolve_to_commit(&repo, ref_a)?; + let commit_b = resolve_to_commit(&repo, ref_b)?; + + // Use gix's merge_base method to find the common ancestor + let base = repo.merge_base(commit_a.id, commit_b.id)?; + + Ok(base.to_string()) +} + +/// Get the current branch name (if on a branch, not detached HEAD) +pub fn get_current_branch>(repo_path: P) -> Result> { + let repo = gix::discover(repo_path)?; + + match repo.head_name()? 
{ + Some(name) => { + let short_name = name.shorten().to_string(); + Ok(Some(short_name)) + } + None => Ok(None), // Detached HEAD + } +} + +/// Check if a branch exists in the repository +pub fn branch_exists>(repo_path: P, branch_name: &str) -> Result { + match resolve_branch(repo_path, branch_name) { + Ok(_) => Ok(true), + Err(_) => Ok(false), + } +} + +// ==================== End Branch Operations ==================== + /// Write a unified diff and extract symbols from changed lines /// This is the same logic used during indexing pub fn write_diff_and_extract_symbols( From 256848984a93e5a6ee90ece592ea74034a5234d5 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 10 Dec 2025 21:24:14 -0500 Subject: [PATCH 3/8] Add multi-branch indexing support to semcode-index - Add CLI flags for branch indexing: - --branch BRANCH (can be repeated) - --branches LIST (comma-separated) - --all-branches (index all local branches) - --remote-branches (include remote branches) - --update-branches (skip already-current branches) Signed-off-by: Sasha Levin --- src/bin/index.rs | 240 +++++++++++++++++++++++++++++ src/git.rs | 47 ++++++ src/git_range.rs | 394 +++++++++++++++++++++++++++++++++++++++++++++++ src/indexer.rs | 56 +++---- 4 files changed, 711 insertions(+), 26 deletions(-) diff --git a/src/bin/index.rs b/src/bin/index.rs index a163b1c..dc0dde0 100644 --- a/src/bin/index.rs +++ b/src/bin/index.rs @@ -103,6 +103,30 @@ struct Args { /// Clone and index a lore.kernel.org archive into /lore/ #[arg(long, value_name = "URL")] lore: Option, + + // ==================== Multi-Branch Indexing ==================== + /// Index a specific branch (can be specified multiple times) + /// Example: --branch main --branch develop + #[arg(long, value_name = "BRANCH")] + branch: Vec, + + /// Comma-separated list of branches to index + /// Example: --branches main,develop,feature-x + #[arg(long, value_name = "LIST", value_delimiter = ',')] + branches: Vec, + + /// Index all local branches + 
#[arg(long)] + all_branches: bool, + + /// Include remote-tracking branches when using --all-branches + #[arg(long)] + remote_branches: bool, + + /// Only index branches that have new commits since last indexed + /// (skip branches already indexed at current tip) + #[arg(long)] + update_branches: bool, } /// Fetch and parse the lore.kernel.org manifest @@ -401,6 +425,215 @@ async fn clone_lore_repository(lore_url: &str, db_path: &str) -> Result Ok(clone_path) } +// ==================== Branch Indexing Support ==================== + +/// Collect branches to index from the various branch-related CLI flags +/// Returns BranchRef structs with proper is_remote/remote metadata +fn collect_branches_to_index(args: &Args) -> Result> { + use semcode::git::{get_branch_info, list_branches, BranchRef}; + + let mut branch_names = Vec::new(); + + // Collect branch names from --branch flags (can be repeated) + branch_names.extend(args.branch.iter().cloned()); + + // Collect from --branches (comma-separated) + branch_names.extend(args.branches.iter().cloned()); + + // Remove duplicates while preserving order + let mut seen = HashSet::new(); + branch_names.retain(|b| seen.insert(b.clone())); + + // Convert manually-specified branch names to BranchRef with proper metadata + let mut branches: Vec = Vec::new(); + for name in &branch_names { + match get_branch_info(&args.source, name) { + Ok(branch_ref) => branches.push(branch_ref), + Err(e) => return Err(anyhow::anyhow!("Branch '{}' does not exist: {}", name, e)), + } + } + + // If --all-branches is set, get all branches from git + if args.all_branches { + let git_branches = list_branches(&args.source, args.remote_branches)?; + for branch in git_branches { + // Skip if already added from --branch/--branches + if !branches.iter().any(|b| b.name == branch.name) { + branches.push(branch); + } + } + } + + Ok(branches) +} + +/// Run branch indexing mode - index multiple branches +async fn run_branch_indexing(args: Args, branches: Vec) -> 
Result<()> { + info!("Starting multi-branch indexing mode"); + info!( + "Branches to index: {:?}", + branches.iter().map(|b| &b.name).collect::>() + ); + + // Process database path + let database_path = process_database_path(args.database.as_deref(), Some(&args.source)); + + // Create database manager wrapped in Arc for efficient sharing with process_git_range + // (Arc::clone is cheap - just increments ref count, reuses same LanceDB connection) + let db_manager = Arc::new( + DatabaseManager::new(&database_path, args.source.to_string_lossy().to_string()).await?, + ); + db_manager.create_tables().await?; + + if args.clear { + println!("Clearing existing data..."); + db_manager.clear_all_data().await?; + println!("Existing data cleared."); + } + + let start_time = std::time::Instant::now(); + let mut branches_indexed = 0; + let mut branches_skipped = 0; + + for branch in &branches { + let branch_name = &branch.name; + let tip_commit = &branch.tip_commit; + + println!( + "\n{}", + format!("=== Processing branch: {} ===", branch_name).cyan() + ); + + info!("Branch {} at commit {}", branch_name, &tip_commit[..8]); + + // Check if branch is already indexed at current tip (if --update-branches) + if args.update_branches + && db_manager + .is_branch_current(branch_name, tip_commit) + .await? + { + println!( + " {} Branch already indexed at current tip, skipping", + "→".yellow() + ); + branches_skipped += 1; + continue; + } + + // Get extensions for this indexing operation + let extensions: Vec = args + .extensions + .split(',') + .map(|s| s.trim().to_string()) + .collect(); + + // Check if this is initial or incremental indexing + if let Some(indexed_tip) = db_manager.get_branch_tip(branch_name).await? 
{ + // Incremental indexing: commits from last indexed tip to current tip + let range = format!("{}..{}", indexed_tip, tip_commit); + info!("Incremental indexing: {} for branch {}", range, branch_name); + + println!(" {} Processing range: {}", "→".blue(), range); + + // Get commit count for this range + let repo = gix::discover(&args.source) + .map_err(|e| anyhow::anyhow!("Not in a git repository: {}", e))?; + + let commit_shas = match list_shas_in_range(&repo, &range) { + Ok(shas) => shas, + Err(e) => { + warn!("Failed to get commits for {}: {}", range, e); + vec![] + } + }; + + if commit_shas.is_empty() { + println!(" {} No new commits to index", "✓".green()); + } else { + println!( + " {} Found {} commits to process", + "→".blue(), + commit_shas.len() + ); + + semcode::git_range::process_git_range( + &args.source, + &range, + &extensions, + db_manager.clone(), + args.no_macros, + args.db_threads, + ) + .await?; + } + } else { + // Initial indexing: index the tree snapshot at the tip commit + // This is MUCH faster than walking all commits (e.g., 80k files vs 1.4M commits for Linux) + info!( + "Initial indexing for branch {} (tree at {})", + branch_name, + &tip_commit[..8] + ); + + println!(" {} Processing tree at {}", "→".blue(), &tip_commit[..8]); + + semcode::git_range::process_git_tree( + &args.source, + tip_commit, + &extensions, + db_manager.clone(), + args.no_macros, + args.db_threads, + ) + .await?; + } + + // Record that this branch has been indexed at the current tip + // Use the proper remote info from BranchRef (not a naive string split) + db_manager + .record_branch_indexed(branch_name, tip_commit, branch.remote.as_deref()) + .await?; + + println!(" {} Branch indexed successfully", "✓".green()); + branches_indexed += 1; + } + + let total_time = start_time.elapsed(); + + println!( + "\n{}", + "=== Multi-Branch Indexing Complete ===".green().bold() + ); + println!("Total time: {:.1}s", total_time.as_secs_f64()); + println!("Branches indexed: {}", 
branches_indexed); + if branches_skipped > 0 { + println!("Branches skipped (already current): {}", branches_skipped); + } + + // List all indexed branches + let indexed = db_manager.list_indexed_branches().await?; + if !indexed.is_empty() { + println!("\nIndexed branches:"); + for branch in indexed { + println!( + " {} → {} (indexed {})", + branch.branch_name.cyan(), + &branch.tip_commit[..8], + chrono::DateTime::from_timestamp(branch.indexed_at, 0) + .map(|dt| dt.format("%Y-%m-%d %H:%M").to_string()) + .unwrap_or_else(|| "unknown".to_string()) + ); + } + } + + println!("\nTo query this database, run:"); + println!(" semcode --database {}", database_path); + + Ok(()) +} + +// ==================== End Branch Indexing Support ==================== + #[tokio::main] async fn main() -> Result<()> { // Suppress ORT verbose logging @@ -642,6 +875,13 @@ async fn main() -> Result<()> { )); } + // Check if branch indexing mode is requested + let branches_to_index = collect_branches_to_index(&args)?; + if !branches_to_index.is_empty() { + info!("Branch indexing mode: {} branches", branches_to_index.len()); + return run_branch_indexing(args, branches_to_index).await; + } + info!("Starting semantic code indexing"); if let Some(ref git_range) = args.git { info!("Git commit indexing mode: {}", git_range); diff --git a/src/git.rs b/src/git.rs index d9fc8c6..5a1908f 100644 --- a/src/git.rs +++ b/src/git.rs @@ -1175,6 +1175,53 @@ pub fn branch_exists>(repo_path: P, branch_name: &str) -> Result< } } +/// Get detailed branch information for a specific branch name +/// This properly determines if a branch is local or remote-tracking +pub fn get_branch_info>(repo_path: P, branch_name: &str) -> Result { + let repo = gix::discover(repo_path.as_ref())?; + + // First, try to resolve as a local branch + let local_ref_name = format!("refs/heads/{}", branch_name); + if let Ok(reference) = repo.find_reference(&local_ref_name) { + if let Ok(commit) = reference.into_fully_peeled_id() { + return 
Ok(BranchRef { + name: branch_name.to_string(), + tip_commit: commit.to_string(), + is_remote: false, + remote: None, + }); + } + } + + // Try to resolve as a remote-tracking branch + let refs = repo.references()?; + for reference in refs.remote_branches()?.flatten() { + let full_name = reference.name().shorten().to_string(); + if full_name == branch_name { + // Extract remote name from the branch name (e.g., "origin/main" -> "origin") + let remote = branch_name.split('/').next().map(String::from); + + if let Ok(commit) = reference.into_fully_peeled_id() { + return Ok(BranchRef { + name: branch_name.to_string(), + tip_commit: commit.to_string(), + is_remote: true, + remote, + }); + } + } + } + + // Fall back to resolve_branch for the commit, assume local if we get here + let tip_commit = resolve_branch(repo_path.as_ref(), branch_name)?; + Ok(BranchRef { + name: branch_name.to_string(), + tip_commit, + is_remote: false, + remote: None, + }) +} + // ==================== End Branch Operations ==================== /// Write a unified diff and extract symbols from changed lines diff --git a/src/git_range.rs b/src/git_range.rs index 06a0aae..c79da3d 100644 --- a/src/git_range.rs +++ b/src/git_range.rs @@ -706,6 +706,400 @@ async fn process_git_tuples_streaming(config: StreamingConfig) -> Result, + source_root: PathBuf, + no_macros: bool, + processed_files: Arc>, + num_workers: usize, + db_manager: Arc, + num_inserters: usize, +} + +/// Stream git file tuples from a tree at a specific commit (producer for tree-based indexing) +fn stream_tree_file_tuples( + repo_path: PathBuf, + commit_sha: String, + extensions: Vec, + tuple_tx: mpsc::Sender, + processed_files: Arc>, +) -> Result { + use crate::git::walk_tree_at_commit; + + let mut sent_files = 0; + let mut filtered_already_processed = 0; + + // Walk the tree at the commit and send tuples for matching files + walk_tree_at_commit(&repo_path, &commit_sha, |relative_path, object_id| { + // Check if file has one of the target 
extensions + let path = std::path::Path::new(relative_path); + let ext_matches = path + .extension() + .map(|ext| extensions.contains(&ext.to_string_lossy().to_string())) + .unwrap_or(false); + + if !ext_matches { + return Ok(()); + } + + let file_sha = object_id.to_string(); + + // Filter out files already processed in database + if processed_files.contains(&file_sha) { + filtered_already_processed += 1; + return Ok(()); + } + + let tuple = GitFileTuple { + file_path: PathBuf::from(relative_path), + file_sha, + object_id: *object_id, + }; + + // Send tuple to channel - if channel is closed, workers are done + if tuple_tx.send(tuple).is_ok() { + sent_files += 1; + } + + Ok(()) + })?; + + tracing::info!( + "Tree walk complete: {} files sent, {} filtered (already processed)", + sent_files, + filtered_already_processed + ); + + Ok(sent_files) +} + +/// Process git tree at a specific commit using streaming pipeline +/// Unlike process_git_range which walks commits, this walks the tree snapshot +async fn process_git_tree_streaming(config: TreeStreamingConfig) -> Result { + use std::sync::Mutex; + + // Create progress bar for file processing + let pb = ProgressBar::new_spinner(); + pb.set_style( + ProgressStyle::with_template( + "{spinner:.green} [{elapsed_precise}] Processing tree: {pos} files processed - {msg}", + ) + .unwrap() + .progress_chars("⠁⠂⠄⡀⢀⠠⠐⠈ "), + ); + pb.set_message(format!("{} workers", config.num_workers)); + + // Create channels with backpressure + let (tuple_tx, tuple_rx) = mpsc::channel::(); + let result_channel_size = (config.num_workers * 2).clamp(4, 64); + let (result_tx, result_rx) = mpsc::sync_channel::(result_channel_size); + + // Wrap receivers for shared access + let shared_tuple_rx = Arc::new(Mutex::new(tuple_rx)); + let shared_result_rx = Arc::new(Mutex::new(result_rx)); + + // Shared progress counters + let processed_count = Arc::new(AtomicUsize::new(0)); + let inserted_functions = Arc::new(AtomicUsize::new(0)); + let inserted_types = 
Arc::new(AtomicUsize::new(0)); + let batches_sent = Arc::new(AtomicUsize::new(0)); + let batches_inserted = Arc::new(AtomicUsize::new(0)); + + // Spawn progress updater thread + let pb_clone = pb.clone(); + let processed_clone = processed_count.clone(); + let functions_clone = inserted_functions.clone(); + let types_clone = inserted_types.clone(); + let batches_sent_clone = batches_sent.clone(); + let batches_inserted_clone = batches_inserted.clone(); + let progress_thread = std::thread::spawn(move || loop { + let files = processed_clone.load(Ordering::Relaxed); + let funcs = functions_clone.load(Ordering::Relaxed); + let types = types_clone.load(Ordering::Relaxed); + let sent = batches_sent_clone.load(Ordering::Relaxed); + let inserted = batches_inserted_clone.load(Ordering::Relaxed); + let pending = sent.saturating_sub(inserted); + + pb_clone.set_position(files as u64); + pb_clone.set_message(format!( + "{} funcs, {} types | {} batches pending", + funcs, types, pending + )); + + if pb_clone.is_finished() { + break; + } + std::thread::sleep(std::time::Duration::from_millis(100)); + }); + + // Spawn single generator thread that walks the tree + let generator_repo_path = config.repo_path.clone(); + let generator_commit_sha = config.commit_sha.clone(); + let generator_extensions = config.extensions.clone(); + let generator_processed_files = config.processed_files.clone(); + + let generator_handle = thread::spawn(move || { + if let Err(e) = stream_tree_file_tuples( + generator_repo_path, + generator_commit_sha, + generator_extensions, + tuple_tx, + generator_processed_files, + ) { + tracing::error!("Tree generator failed: {}", e); + } + }); + + // Use a mutex as a simple semaphore to control worker batch filling + let accumulate_lock = Arc::new(std::sync::Mutex::new(())); + + // Spawn worker threads (reuse existing tuple_worker_shared) + let mut worker_handles = Vec::new(); + for worker_id in 0..config.num_workers { + let ctx = TupleWorkerContext { + worker_id, + 
shared_tuple_rx: shared_tuple_rx.clone(), + result_tx: result_tx.clone(), + repo_path: config.repo_path.clone(), + source_root: config.source_root.clone(), + no_macros: config.no_macros, + processed_count: processed_count.clone(), + batches_sent: batches_sent.clone(), + accumulate_lock: accumulate_lock.clone(), + }; + + let handle = thread::spawn(move || { + tuple_worker_shared(ctx); + }); + worker_handles.push(handle); + } + + // Close the original result sender (workers have clones) + drop(result_tx); + + // Spawn database inserter tasks + let mut inserter_handles = Vec::new(); + let last_optimization_check = Arc::new(std::sync::Mutex::new(std::time::Instant::now())); + + for inserter_id in 0..config.num_inserters { + let db_manager_clone = Arc::clone(&config.db_manager); + let result_rx_clone = shared_result_rx.clone(); + let functions_counter = inserted_functions.clone(); + let types_counter = inserted_types.clone(); + let batches_inserted_counter = batches_inserted.clone(); + let optimization_check_timer = last_optimization_check.clone(); + + let handle = tokio::spawn(async move { + loop { + let batch = { + let rx = result_rx_clone.lock().unwrap(); + rx.recv() + }; + + match batch { + Ok(batch) => { + let func_count = batch.functions.len(); + let type_count = batch.types.len(); + + let (func_result, type_result, processed_files_result) = tokio::join!( + async { + if !batch.functions.is_empty() { + db_manager_clone.insert_functions(batch.functions).await + } else { + Ok(()) + } + }, + async { + if !batch.types.is_empty() { + db_manager_clone.insert_types(batch.types).await + } else { + Ok(()) + } + }, + async { + if !batch.processed_files.is_empty() { + db_manager_clone + .mark_files_processed(batch.processed_files) + .await + } else { + Ok(()) + } + } + ); + + let mut insertion_successful = true; + if let Err(e) = func_result { + error!("Inserter {} failed to insert functions: {}", inserter_id, e); + insertion_successful = false; + } else { + 
functions_counter.fetch_add(func_count, Ordering::Relaxed); + } + if let Err(e) = type_result { + error!("Inserter {} failed to insert types: {}", inserter_id, e); + insertion_successful = false; + } else { + types_counter.fetch_add(type_count, Ordering::Relaxed); + } + if let Err(e) = processed_files_result { + error!( + "Inserter {} failed to insert processed_files: {}", + inserter_id, e + ); + insertion_successful = false; + } + + if insertion_successful { + let total_batches = + batches_inserted_counter.fetch_add(1, Ordering::Relaxed) + 1; + + crate::indexer::check_and_optimize_if_needed( + &db_manager_clone, + inserter_id, + total_batches, + &optimization_check_timer, + ) + .await; + } + } + Err(_) => break, + } + } + }); + + inserter_handles.push(handle); + } + + // Wait for generator to complete + if let Err(e) = generator_handle.join() { + tracing::error!("Tree generator thread panicked: {:?}", e); + } + + // Wait for all workers to complete + for (worker_id, handle) in worker_handles.into_iter().enumerate() { + if let Err(e) = handle.join() { + tracing::error!("Worker {} thread panicked: {:?}", worker_id, e); + } + } + + // Wait for all inserter tasks to finish + for (inserter_id, handle) in inserter_handles.into_iter().enumerate() { + if let Err(e) = handle.await { + tracing::error!("Inserter {} task failed: {:?}", inserter_id, e); + } + } + + // Collect final statistics + let stats = GitTupleStats { + files_processed: processed_count.load(Ordering::Relaxed), + functions_count: inserted_functions.load(Ordering::Relaxed), + types_count: inserted_types.load(Ordering::Relaxed), + }; + + pb.finish_with_message(format!( + "Complete: {} files, {} functions, {} types", + stats.files_processed, stats.functions_count, stats.types_count + )); + progress_thread.join().unwrap(); + + Ok(stats) +} + +/// Process git tree at a specific commit (snapshot-based indexing) +/// This indexes all files at the commit without walking commit history. 
+/// Use this for initial branch indexing instead of process_git_range. +pub async fn process_git_tree( + repo_path: &std::path::Path, + commit_sha: &str, + extensions: &[String], + db_manager: Arc, + no_macros: bool, + db_threads: usize, +) -> Result<()> { + info!( + "Processing git tree at {} using streaming pipeline", + &commit_sha[..8.min(commit_sha.len())] + ); + + let start_time = std::time::Instant::now(); + + // Get already processed files from database for deduplication + info!("Loading processed files from database for deduplication"); + let processed_files_records = db_manager.get_all_processed_files().await?; + let processed_files: HashSet = processed_files_records + .into_iter() + .map(|record| record.git_file_sha) + .collect(); + + info!( + "Found {} already processed files in database", + processed_files.len() + ); + let processed_files = Arc::new(processed_files); + + // Determine number of workers + let num_workers = num_cpus::get().max(1); + info!( + "Starting tree streaming pipeline with {} worker threads", + num_workers + ); + + // Process tree using streaming pipeline + let processing_start = std::time::Instant::now(); + let config = TreeStreamingConfig { + repo_path: repo_path.to_path_buf(), + commit_sha: commit_sha.to_string(), + extensions: extensions.to_vec(), + source_root: repo_path.to_path_buf(), + no_macros, + processed_files, + num_workers, + db_manager: db_manager.clone(), + num_inserters: db_threads, + }; + let stats = process_git_tree_streaming(config).await?; + + let processing_time = processing_start.elapsed(); + + info!( + "Tree pipeline completed in {:.1}s: {} files, {} functions, {} types", + processing_time.as_secs_f64(), + stats.files_processed, + stats.functions_count, + stats.types_count + ); + + let total_time = start_time.elapsed(); + + println!("\n=== Git Tree Indexing Complete ==="); + println!("Total time: {:.1}s", total_time.as_secs_f64()); + println!("Files processed: {}", stats.files_processed); + 
println!("Functions indexed: {}", stats.functions_count); + println!("Types indexed: {}", stats.types_count); + + // Check if optimization is needed + match db_manager.check_optimization_health().await { + Ok((needs_optimization, message)) => { + if needs_optimization { + println!("\n{}", message); + match db_manager.optimize_database().await { + Ok(_) => println!("Database optimization completed successfully"), + Err(e) => error!("Failed to optimize database: {}", e), + } + } else { + println!("\n{}", message); + } + } + Err(e) => { + error!("Failed to check database health: {}", e); + } + } + + Ok(()) +} + /// Parse tags from commit message (e.g., Signed-off-by:, Reported-by:, etc.) /// Process git range using streaming file tuple pipeline /// This is the shared implementation used by semcode-index, query, and MCP tools diff --git a/src/indexer.rs b/src/indexer.rs index 266498f..58675bd 100644 --- a/src/indexer.rs +++ b/src/indexer.rs @@ -89,36 +89,40 @@ pub async fn check_and_optimize_if_needed( /// Parse git range and get all commit SHAs in the range /// Uses gitoxide's built-in rev-spec parsing for proper A..B semantics pub fn list_shas_in_range(repo: &gix::Repository, range: &str) -> Result> { - // For simplicity, let's just handle the common A..B case manually for now - // and use gitoxide's rev_walk properly - if !range.contains("..") { - return Err(anyhow::anyhow!( - "Only range format (A..B) is supported, got: '{}'", - range - )); - } - - // Parse A..B manually - let parts: Vec<&str> = range.split("..").collect(); - if parts.len() != 2 { - return Err(anyhow::anyhow!("Invalid range format '{}'", range)); - } - - let from_spec = parts[0]; - let to_spec = parts[1]; + // Support two formats: + // 1. "A..B" - commits reachable from B but not from A + // 2. "REF" (no ..) 
- all commits reachable from REF (for initial indexing) + + let (from_spec, to_spec) = if range.contains("..") { + let parts: Vec<&str> = range.split("..").collect(); + if parts.len() != 2 { + return Err(anyhow::anyhow!("Invalid range format '{}'", range)); + } + (parts[0], parts[1]) + } else { + // No range separator - return all commits up to this ref + ("", range) + }; - // Resolve the commit IDs - let from_commit = resolve_to_commit(repo, from_spec)?; + // Resolve the target commit let to_commit = resolve_to_commit(repo, to_spec)?; - let from_id = from_commit.id().detach(); let to_id = to_commit.id().detach(); - // Use rev_walk with proper include/exclude - let walk = repo - .rev_walk([to_id]) - .with_hidden([from_id]) - .sorting(Sorting::ByCommitTime(Default::default())) - .all()?; + // Build the rev_walk - optionally exclude ancestors of from_spec + let walk = if from_spec.is_empty() { + // No exclusion - include all ancestors + repo.rev_walk([to_id]) + .sorting(Sorting::ByCommitTime(Default::default())) + .all()? + } else { + // Exclude commits reachable from from_spec + let from_commit = resolve_to_commit(repo, from_spec)?; + let from_id = from_commit.id().detach(); + repo.rev_walk([to_id]) + .with_hidden([from_id]) + .sorting(Sorting::ByCommitTime(Default::default())) + .all()? + }; let mut shas = Vec::new(); let mut commit_count = 0; From 428afa92cafcd924c2466bf9ba52730b63f1ff5b Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 10 Dec 2025 21:27:59 -0500 Subject: [PATCH 4/8] Add --branch flag to query tool for branch-aware queries Adds the --branch CLI flag to semcode query tool allowing users to query code at a specific branch instead of the current HEAD. 
Signed-off-by: Sasha Levin --- src/bin/query.rs | 15 ++++++++++++- src/bin/query_impl/commands.rs | 40 ++++++++++++++++++++++++++++++---- 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/src/bin/query.rs b/src/bin/query.rs index 25b5f46..67e2efb 100644 --- a/src/bin/query.rs +++ b/src/bin/query.rs @@ -27,6 +27,11 @@ struct Args { /// Path to local model directory (for semantic search) #[arg(long, value_name = "PATH")] model_path: Option, + + /// Query code at a specific branch instead of current HEAD + /// Example: --branch main + #[arg(long, value_name = "BRANCH")] + branch: Option, } /// Check if the current commit needs indexing and perform incremental indexing if needed @@ -192,7 +197,15 @@ async fn main() -> Result<()> { let parts: Vec<&str> = parts_owned.iter().map(|s| s.as_str()).collect(); // Handle command and check if we should exit - if handle_command(&parts, &db_manager, &args.git_repo, &args.model_path).await? { + if handle_command( + &parts, + &db_manager, + &args.git_repo, + &args.model_path, + &args.branch, + ) + .await? 
+ { break; } } diff --git a/src/bin/query_impl/commands.rs b/src/bin/query_impl/commands.rs index 1f92fd2..26014ea 100644 --- a/src/bin/query_impl/commands.rs +++ b/src/bin/query_impl/commands.rs @@ -88,14 +88,45 @@ struct ShowCommitMetadataParams<'a> { /// Parse a potential git SHA from command arguments or default to current HEAD /// Returns (remaining_args, git_sha) -/// Now always returns a git SHA - either from --git flag, current HEAD, or a default -fn parse_git_sha<'a>(parts: &'a [&'a str], git_repo_path: &str) -> Result<(Vec<&'a str>, String)> { +/// Now always returns a git SHA - either from --git flag, target branch, current HEAD, or a default +fn parse_git_sha<'a>( + parts: &'a [&'a str], + git_repo_path: &str, + target_branch: &Option, +) -> Result<(Vec<&'a str>, String)> { if parts.len() >= 3 && parts[1] == "--git" { let git_sha = parts[2].to_string(); let remaining: Vec<&str> = [&parts[0..1], &parts[3..]].concat(); Ok((remaining, git_sha)) + } else if let Some(branch) = target_branch { + // Use the target branch to resolve to a specific commit + match git::resolve_branch(git_repo_path, branch) { + Ok(sha) => { + tracing::debug!( + "Using branch '{}' as git SHA: {}", + branch, + &sha[..8.min(sha.len())] + ); + Ok((parts.to_vec(), sha)) + } + Err(e) => { + tracing::warn!( + "Failed to resolve branch '{}': {}, falling back to HEAD", + branch, + e + ); + // Fall back to HEAD + match git::get_git_sha(git_repo_path) { + Ok(Some(head_sha)) => Ok((parts.to_vec(), head_sha)), + _ => Ok(( + parts.to_vec(), + "0000000000000000000000000000000000000000".to_string(), + )), + } + } + } } else { - // No --git flag provided, try to get current HEAD + // No --git flag or target branch provided, try to get current HEAD match git::get_git_sha(git_repo_path) { Ok(Some(head_sha)) => { tracing::debug!("Using current HEAD as default git SHA: {}", head_sha); @@ -344,6 +375,7 @@ pub async fn handle_command( db: &DatabaseManager, git_repo_path: &str, model_path: &Option, + 
target_branch: &Option, ) -> Result { // Handle commit command first (before parse_git_sha) since it uses --git differently if parts[0] == "commit" { @@ -512,7 +544,7 @@ pub async fn handle_command( } // Parse potential git SHA first (for all other commands) - let (parts, git_sha) = parse_git_sha(parts, git_repo_path)?; + let (parts, git_sha) = parse_git_sha(parts, git_repo_path, target_branch)?; match parts[0] { "quit" | "exit" | "q" => { From f41f75a57a8a83cd206687e7bdf6e31c6ccb0d84 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 10 Dec 2025 21:35:53 -0500 Subject: [PATCH 5/8] Add interactive branch commands to REPL Adds three new commands to the query REPL: - `branches` (or `br`): List indexed branches with their status - `branch`: Show current branch information and target branch - `compare`: Compare two branches showing merge base and indexing status Signed-off-by: Sasha Levin --- src/bin/query_impl/commands.rs | 265 +++++++++++++++++++++++++++++++++ src/display.rs | 16 ++ 2 files changed, 281 insertions(+) diff --git a/src/bin/query_impl/commands.rs b/src/bin/query_impl/commands.rs index 26014ea..1669d24 100644 --- a/src/bin/query_impl/commands.rs +++ b/src/bin/query_impl/commands.rs @@ -1,6 +1,7 @@ // SPDX-License-Identifier: MIT OR Apache-2.0 use anstream::stdout; use anyhow::Result; +use chrono::{TimeZone, Utc}; use colored::*; use regex; use semcode::{git, DatabaseManager, LoreEmailFilters}; @@ -1898,6 +1899,270 @@ pub async fn handle_command( } } } + "branches" | "br" => { + // List indexed branches + match db.list_indexed_branches().await { + Ok(branches) => { + if branches.is_empty() { + println!("{}", "No branches have been indexed yet.".yellow()); + println!("Use 'semcode-index --branch ' to index a branch."); + } else { + println!("{}", "Indexed Branches:".bold().cyan()); + println!("{:─<70}", "".bright_black()); + + // Get current git branch for comparison + let current_branch = git::get_current_branch(git_repo_path).ok().flatten(); + + for 
branch in &branches { + let is_current = current_branch.as_ref() == Some(&branch.branch_name); + let current_marker = if is_current { " (current)" } else { "" }; + let remote_info = branch + .remote + .as_ref() + .map(|r| format!(" [{}]", r)) + .unwrap_or_default(); + + // Check if branch is up-to-date with repo + let status = + match git::resolve_branch(git_repo_path, &branch.branch_name) { + Ok(current_tip) => { + if current_tip == branch.tip_commit { + "up-to-date".green().to_string() + } else { + "outdated".yellow().to_string() + } + } + Err(_) => "unknown".bright_black().to_string(), + }; + + println!( + " {} {}{}{}", + branch.branch_name.yellow(), + format!( + "({})", + &branch.tip_commit[..8.min(branch.tip_commit.len())] + ) + .bright_black(), + remote_info.cyan(), + current_marker.green() + ); + let indexed_time = Utc + .timestamp_opt(branch.indexed_at, 0) + .single() + .map(|dt| dt.format("%Y-%m-%d %H:%M:%S").to_string()) + .unwrap_or_else(|| "unknown".to_string()); + println!( + " Status: {} | Indexed: {}", + status, + indexed_time.bright_black() + ); + } + + println!("{:─<70}", "".bright_black()); + println!("Total: {} branch(es) indexed", branches.len()); + } + } + Err(e) => { + println!("{} Failed to list branches: {}", "Error:".red(), e); + } + } + } + "branch" => { + // Show current branch info + println!("{}", "Branch Information:".bold().cyan()); + println!("{:─<70}", "".bright_black()); + + // Show current git branch + match git::get_current_branch(git_repo_path) { + Ok(Some(branch)) => { + println!(" Current git branch: {}", branch.yellow()); + } + Ok(None) => { + println!( + " Current git branch: {} (detached HEAD)", + "none".bright_black() + ); + } + Err(e) => { + println!(" Current git branch: {} ({})", "unknown".red(), e); + } + } + + // Show current HEAD SHA + match git::get_git_sha(git_repo_path) { + Ok(Some(sha)) => { + println!( + " Current HEAD: {}", + sha[..12.min(sha.len())].bright_black() + ); + } + Ok(None) => { + println!(" Current 
HEAD: {}", "not in a git repo".bright_black()); + } + Err(e) => { + println!(" Current HEAD: {} ({})", "unknown".red(), e); + } + } + + // Show query target branch if set + if let Some(target) = target_branch { + println!(" Query target branch: {}", target.green()); + match git::resolve_branch(git_repo_path, target) { + Ok(sha) => { + println!(" Target SHA: {}", sha[..12.min(sha.len())].bright_black()); + } + Err(e) => { + println!(" Target SHA: {} ({})", "unknown".red(), e); + } + } + } else { + println!( + " Query target branch: {} (using HEAD)", + "none".bright_black() + ); + } + + println!("{:─<70}", "".bright_black()); + } + "compare" => { + // Compare branches + if parts.len() < 3 { + println!("{}", "Usage: compare ".red()); + println!(" Compare two branches and show their relationship"); + println!(" Example: compare main feature-branch"); + println!(" Example: compare origin/main develop"); + } else { + let branch1 = parts[1]; + let branch2 = parts[2]; + + // Resolve both branches to SHAs + let sha1 = match git::resolve_branch(git_repo_path, branch1) { + Ok(sha) => sha, + Err(e) => { + println!( + "{} Cannot resolve branch '{}': {}", + "Error:".red(), + branch1, + e + ); + return Ok(false); + } + }; + let sha2 = match git::resolve_branch(git_repo_path, branch2) { + Ok(sha) => sha, + Err(e) => { + println!( + "{} Cannot resolve branch '{}': {}", + "Error:".red(), + branch2, + e + ); + return Ok(false); + } + }; + + println!( + "{}", + format!("Branch Comparison: {} vs {}", branch1, branch2) + .bold() + .cyan() + ); + println!("{:─<70}", "".bright_black()); + + // Show branch tips + println!("\n{}", "Branch Tips:".bold()); + println!( + " {}: {}", + branch1.yellow(), + &sha1[..12.min(sha1.len())].bright_black() + ); + println!( + " {}: {}", + branch2.yellow(), + &sha2[..12.min(sha2.len())].bright_black() + ); + + // Try to find merge base + match git::find_merge_base(git_repo_path, &sha1, &sha2) { + Ok(merge_base) => { + println!("\n{}", "Merge 
Base:".bold()); + println!( + " {}", + &merge_base[..12.min(merge_base.len())].bright_black() + ); + + // Show which branch is ahead + if merge_base == sha1 { + println!( + "\n{}", + format!("{} is behind {}", branch1, branch2).yellow() + ); + } else if merge_base == sha2 { + println!( + "\n{}", + format!("{} is behind {}", branch2, branch1).yellow() + ); + } else { + println!("\n{}", "Branches have diverged from merge base".yellow()); + } + } + Err(e) => { + println!("\n{} Could not find merge base: {}", "Warning:".yellow(), e); + } + } + + // Check indexing status for both branches + println!("\n{}", "Indexing Status:".bold()); + match db.get_indexed_branch_info(branch1).await { + Ok(Some(info)) => { + let status = if info.tip_commit == sha1 { + "up-to-date".green().to_string() + } else { + "outdated".yellow().to_string() + }; + println!( + " {}: {} (indexed at {})", + branch1.yellow(), + status, + &info.tip_commit[..8.min(info.tip_commit.len())].bright_black() + ); + } + Ok(None) => { + println!(" {}: {}", branch1.yellow(), "not indexed".red()); + } + Err(_) => { + println!(" {}: {}", branch1.yellow(), "unknown".bright_black()); + } + } + match db.get_indexed_branch_info(branch2).await { + Ok(Some(info)) => { + let status = if info.tip_commit == sha2 { + "up-to-date".green().to_string() + } else { + "outdated".yellow().to_string() + }; + println!( + " {}: {} (indexed at {})", + branch2.yellow(), + status, + &info.tip_commit[..8.min(info.tip_commit.len())].bright_black() + ); + } + Ok(None) => { + println!(" {}: {}", branch2.yellow(), "not indexed".red()); + } + Err(_) => { + println!(" {}: {}", branch2.yellow(), "unknown".bright_black()); + } + } + + println!("\n{:─<70}", "".bright_black()); + println!( + "{}", + "Hint: Use --branch to query at a specific branch".bright_black() + ); + } + } _ => { println!( "{} Unknown command: '{}'. 
Type 'help' for available commands.", diff --git a/src/display.rs b/src/display.rs index 4ba93a5..b9f6e98 100644 --- a/src/display.rs +++ b/src/display.rs @@ -156,6 +156,22 @@ fn print_command_help() { "dig".yellow() ); + println!(); + println!("{}", "Branch Commands:".bold().cyan()); + println!( + " {} ({}) - List indexed branches", + "branches".yellow(), + "br".yellow() + ); + println!( + " {} - Show current branch information", + "branch".yellow() + ); + println!( + " {} - Compare two branches", + "compare".yellow() + ); + println!(); println!("{}", "General:".bold().cyan()); println!( From 4b2178c22d28d0839a50c9f5db8e07543c8b8a12 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 10 Dec 2025 21:42:36 -0500 Subject: [PATCH 6/8] Add branch parameter to MCP tools for branch-aware queries Adds a new `branch` parameter to MCP tools that support git_sha: - find_function, find_type, find_callers, find_calls - find_callchain, grep_functions, vgrep_functions When branch is provided, it's resolved to a SHA and used for the query. Branch takes precedence over git_sha if both are specified. 
Signed-off-by: Sasha Levin --- src/bin/semcode-mcp.rs | 85 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 73 insertions(+), 12 deletions(-) diff --git a/src/bin/semcode-mcp.rs b/src/bin/semcode-mcp.rs index e49b1ec..da6bbd1 100644 --- a/src/bin/semcode-mcp.rs +++ b/src/bin/semcode-mcp.rs @@ -1987,6 +1987,7 @@ struct McpServer { db: Arc, default_git_sha: Option, model_path: Option, + git_repo_path: String, page_cache: PageCache, indexing_state: Arc>, notification_tx: Arc>>>, @@ -2022,6 +2023,7 @@ impl McpServer { db, default_git_sha, model_path, + git_repo_path: git_repo_path.to_string(), page_cache: PageCache::new(), indexing_state: Arc::new(tokio::sync::Mutex::new(IndexingState::new())), notification_tx: Arc::new(tokio::sync::Mutex::new(None)), @@ -2037,6 +2039,26 @@ impl McpServer { .unwrap_or_else(|| "0000000000000000000000000000000000000000".to_string()) } + /// Resolve git SHA from either git_sha or branch argument + /// If branch is provided, resolve it to a SHA. Otherwise use git_sha or default. 
+ fn resolve_git_sha_or_branch( + &self, + git_sha_arg: Option<&str>, + branch_arg: Option<&str>, + ) -> String { + // Branch takes precedence if provided + if let Some(branch) = branch_arg { + match git::resolve_branch(&self.git_repo_path, branch) { + Ok(sha) => return sha, + Err(e) => { + eprintln!("Warning: Failed to resolve branch '{}': {}", branch, e); + // Fall through to git_sha or default + } + } + } + self.resolve_git_sha(git_sha_arg) + } + /// Check if the database appears to be empty and return a helpful message if so async fn check_database_status(&self) -> Option { let state = self.indexing_state.lock().await; @@ -2124,7 +2146,7 @@ impl McpServer { "tools": [ { "name": "find_function", - "description": "Find a function or macro by exact name, optionally at a specific git commit", + "description": "Find a function or macro by exact name, optionally at a specific git commit or branch", "inputSchema": { "type": "object", "properties": { @@ -2135,6 +2157,10 @@ impl McpServer { "git_sha": { "type": "string", "description": "Optional git commit SHA to search at (defaults to current HEAD)" + }, + "branch": { + "type": "string", + "description": "Optional branch name to search at (e.g., 'main', 'develop'). Takes precedence over git_sha if both are provided." } }, "required": ["name"] @@ -2142,7 +2168,7 @@ impl McpServer { }, { "name": "find_type", - "description": "Find a type, struct, union, or typedef by exact name, optionally at a specific git commit", + "description": "Find a type, struct, union, or typedef by exact name, optionally at a specific git commit or branch", "inputSchema": { "type": "object", "properties": { @@ -2153,6 +2179,10 @@ impl McpServer { "git_sha": { "type": "string", "description": "Optional git commit SHA to search at (defaults to current HEAD)" + }, + "branch": { + "type": "string", + "description": "Optional branch name to search at (e.g., 'main', 'develop'). Takes precedence over git_sha if both are provided." 
} }, "required": ["name"] @@ -2160,7 +2190,7 @@ impl McpServer { }, { "name": "find_callers", - "description": "Find all functions that call a specific function, optionally at a specific git commit", + "description": "Find all functions that call a specific function, optionally at a specific git commit or branch", "inputSchema": { "type": "object", "properties": { @@ -2171,6 +2201,10 @@ impl McpServer { "git_sha": { "type": "string", "description": "Optional git commit SHA to search at (defaults to current HEAD)" + }, + "branch": { + "type": "string", + "description": "Optional branch name to search at (e.g., 'main', 'develop'). Takes precedence over git_sha if both are provided." } }, "required": ["name"] @@ -2178,7 +2212,7 @@ impl McpServer { }, { "name": "find_calls", - "description": "Find all functions called by a specific function, optionally at a specific git commit", + "description": "Find all functions called by a specific function, optionally at a specific git commit or branch", "inputSchema": { "type": "object", "properties": { @@ -2189,6 +2223,10 @@ impl McpServer { "git_sha": { "type": "string", "description": "Optional git commit SHA to search at (defaults to current HEAD)" + }, + "branch": { + "type": "string", + "description": "Optional branch name to search at (e.g., 'main', 'develop'). Takes precedence over git_sha if both are provided." 
} }, "required": ["name"] @@ -2196,7 +2234,7 @@ impl McpServer { }, { "name": "find_callchain", - "description": "Show the complete call chain (both forward and reverse) for a function, optionally at a specific git commit", + "description": "Show the complete call chain (both forward and reverse) for a function, optionally at a specific git commit or branch", "inputSchema": { "type": "object", "properties": { @@ -2208,6 +2246,10 @@ impl McpServer { "type": "string", "description": "Optional git commit SHA to search at (defaults to current HEAD)" }, + "branch": { + "type": "string", + "description": "Optional branch name to search at (e.g., 'main', 'develop'). Takes precedence over git_sha if both are provided." + }, "up_levels": { "type": "integer", "description": "Number of caller levels to show (default: 2, 0 = no limit)", @@ -2263,6 +2305,10 @@ impl McpServer { "type": "string", "description": "Optional git commit SHA to search at (defaults to current HEAD)" }, + "branch": { + "type": "string", + "description": "Optional branch name to search at (e.g., 'main', 'develop'). Takes precedence over git_sha if both are provided." + }, "path_pattern": { "type": "string", "description": "Optional regex pattern to filter results by file path" @@ -2291,6 +2337,10 @@ impl McpServer { "type": "string", "description": "Optional git commit SHA to search at (defaults to current HEAD)" }, + "branch": { + "type": "string", + "description": "Optional branch name to search at (e.g., 'main', 'develop'). Takes precedence over git_sha if both are provided." 
+ }, "path_pattern": { "type": "string", "description": "Optional regex pattern to filter results by file path" @@ -2688,7 +2738,8 @@ impl McpServer { let name = args["name"].as_str().unwrap_or(""); let git_sha_arg = args["git_sha"].as_str(); - let git_sha = self.resolve_git_sha(git_sha_arg); + let branch_arg = args["branch"].as_str(); + let git_sha = self.resolve_git_sha_or_branch(git_sha_arg, branch_arg); match mcp_query_function_or_macro(&self.db, name, &git_sha).await { Ok(output) => json!({ @@ -2711,7 +2762,8 @@ impl McpServer { let name = args["name"].as_str().unwrap_or(""); let git_sha_arg = args["git_sha"].as_str(); - let git_sha = self.resolve_git_sha(git_sha_arg); + let branch_arg = args["branch"].as_str(); + let git_sha = self.resolve_git_sha_or_branch(git_sha_arg, branch_arg); match mcp_query_type_or_typedef(&self.db, name, &git_sha).await { Ok(output) => json!({ @@ -2734,7 +2786,8 @@ impl McpServer { let name = args["name"].as_str().unwrap_or(""); let git_sha_arg = args["git_sha"].as_str(); - let git_sha = self.resolve_git_sha(git_sha_arg); + let branch_arg = args["branch"].as_str(); + let git_sha = self.resolve_git_sha_or_branch(git_sha_arg, branch_arg); match mcp_show_callers(&self.db, name, &git_sha).await { Ok(output) => json!({ @@ -2757,7 +2810,8 @@ impl McpServer { let name = args["name"].as_str().unwrap_or(""); let git_sha_arg = args["git_sha"].as_str(); - let git_sha = self.resolve_git_sha(git_sha_arg); + let branch_arg = args["branch"].as_str(); + let git_sha = self.resolve_git_sha_or_branch(git_sha_arg, branch_arg); match mcp_show_calls(&self.db, name, &git_sha).await { Ok(output) => json!({ @@ -2780,7 +2834,8 @@ impl McpServer { let name = args["name"].as_str().unwrap_or(""); let git_sha_arg = args["git_sha"].as_str(); - let git_sha = self.resolve_git_sha(git_sha_arg); + let branch_arg = args["branch"].as_str(); + let git_sha = self.resolve_git_sha_or_branch(git_sha_arg, branch_arg); // Parse the new parameters with same defaults as query 
tool let up_levels = args["up_levels"].as_u64().unwrap_or(2) as usize; @@ -2836,10 +2891,11 @@ impl McpServer { let pattern = args["pattern"].as_str().unwrap_or(""); let verbose = args["verbose"].as_bool().unwrap_or(false); let git_sha_arg = args["git_sha"].as_str(); + let branch_arg = args["branch"].as_str(); let path_pattern = args["path_pattern"].as_str(); let limit = args["limit"].as_u64().unwrap_or(100) as usize; - let git_sha = self.resolve_git_sha(git_sha_arg); + let git_sha = self.resolve_git_sha_or_branch(git_sha_arg, branch_arg); match mcp_grep_function_bodies(&self.db, pattern, verbose, path_pattern, limit, &git_sha) .await @@ -2864,10 +2920,11 @@ impl McpServer { let query_text = args["query_text"].as_str().unwrap_or(""); let git_sha_arg = args["git_sha"].as_str(); + let branch_arg = args["branch"].as_str(); let path_pattern = args["path_pattern"].as_str(); let limit = args["limit"].as_u64().unwrap_or(10) as usize; - let _git_sha = self.resolve_git_sha(git_sha_arg); + let _git_sha = self.resolve_git_sha_or_branch(git_sha_arg, branch_arg); match mcp_vgrep_similar_functions( &self.db, @@ -5291,6 +5348,7 @@ mod tests { db, default_git_sha: None, model_path: None, + git_repo_path: ".".to_string(), page_cache: PageCache::new(), indexing_state: Arc::new(tokio::sync::Mutex::new(IndexingState::new())), notification_tx: Arc::new(tokio::sync::Mutex::new(None)), @@ -5324,6 +5382,7 @@ mod tests { db, default_git_sha: None, model_path: None, + git_repo_path: ".".to_string(), page_cache: PageCache::new(), indexing_state: Arc::new(tokio::sync::Mutex::new(state)), notification_tx: Arc::new(tokio::sync::Mutex::new(None)), @@ -5359,6 +5418,7 @@ mod tests { db, default_git_sha: None, model_path: None, + git_repo_path: ".".to_string(), page_cache: PageCache::new(), indexing_state: Arc::new(tokio::sync::Mutex::new(state)), notification_tx: Arc::new(tokio::sync::Mutex::new(None)), @@ -5391,6 +5451,7 @@ mod tests { db, default_git_sha: None, model_path: None, + git_repo_path: 
".".to_string(), page_cache: PageCache::new(), indexing_state: Arc::new(tokio::sync::Mutex::new(state)), notification_tx: Arc::new(tokio::sync::Mutex::new(None)), From abfa9bc875591c436bd63a53325884bfef8f49da Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 10 Dec 2025 21:44:30 -0500 Subject: [PATCH 7/8] Add list_branches and compare_branches MCP tools Adds two new MCP tools for branch management: - list_branches: Lists all indexed branches with their status - compare_branches: Compares two branches showing merge base, ahead/behind status, and indexing status Signed-off-by: Sasha Levin --- src/bin/semcode-mcp.rs | 194 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) diff --git a/src/bin/semcode-mcp.rs b/src/bin/semcode-mcp.rs index da6bbd1..d37b8a7 100644 --- a/src/bin/semcode-mcp.rs +++ b/src/bin/semcode-mcp.rs @@ -2696,6 +2696,32 @@ impl McpServer { "type": "object", "properties": {} } + }, + { + "name": "list_branches", + "description": "List all indexed branches with their status (up-to-date or outdated)", + "inputSchema": { + "type": "object", + "properties": {} + } + }, + { + "name": "compare_branches", + "description": "Compare two branches showing their relationship (merge base, which is ahead/behind)", + "inputSchema": { + "type": "object", + "properties": { + "branch1": { + "type": "string", + "description": "First branch name (e.g., 'main', 'develop')" + }, + "branch2": { + "type": "string", + "description": "Second branch name (e.g., 'feature-branch', 'origin/main')" + } + }, + "required": ["branch1", "branch2"] + } } ] }) @@ -2720,6 +2746,8 @@ impl McpServer { "dig" => self.handle_dig(arguments).await, "vlore_similar_emails" => self.handle_vlore_similar_emails(arguments).await, "indexing_status" => self.handle_indexing_status().await, + "list_branches" => self.handle_list_branches().await, + "compare_branches" => self.handle_compare_branches(arguments).await, _ => json!({ "error": format!("Unknown tool: {}", name), 
"isError": true @@ -3602,6 +3630,172 @@ impl McpServer { "content": [{"type": "text", "text": result}] }) } + + async fn handle_list_branches(&self) -> Value { + match self.db.list_indexed_branches().await { + Ok(branches) => { + let mut output = "=== Indexed Branches ===\n\n".to_string(); + + if branches.is_empty() { + output.push_str("No branches have been indexed yet.\n"); + output.push_str("Use 'semcode-index --branch ' to index a branch.\n"); + } else { + for branch in &branches { + // Check if branch is up-to-date + let status = + match git::resolve_branch(&self.git_repo_path, &branch.branch_name) { + Ok(current_tip) => { + if current_tip == branch.tip_commit { + "up-to-date" + } else { + "outdated" + } + } + Err(_) => "unknown", + }; + + let remote_info = branch + .remote + .as_ref() + .map(|r| format!(" [{}]", r)) + .unwrap_or_default(); + + output.push_str(&format!( + " {} ({}){}\n", + branch.branch_name, + &branch.tip_commit[..8.min(branch.tip_commit.len())], + remote_info + )); + output.push_str(&format!(" Status: {}\n\n", status)); + } + + output.push_str(&format!("Total: {} branch(es) indexed\n", branches.len())); + } + + json!({ + "content": [{"type": "text", "text": output}] + }) + } + Err(e) => json!({ + "error": format!("Failed to list branches: {}", e), + "isError": true + }), + } + } + + async fn handle_compare_branches(&self, args: &Value) -> Value { + let branch1 = args["branch1"].as_str().unwrap_or(""); + let branch2 = args["branch2"].as_str().unwrap_or(""); + + if branch1.is_empty() || branch2.is_empty() { + return json!({ + "error": "Both branch1 and branch2 are required", + "isError": true + }); + } + + // Resolve both branches to SHAs + let sha1 = match git::resolve_branch(&self.git_repo_path, branch1) { + Ok(sha) => sha, + Err(e) => { + return json!({ + "error": format!("Cannot resolve branch '{}': {}", branch1, e), + "isError": true + }); + } + }; + let sha2 = match git::resolve_branch(&self.git_repo_path, branch2) { + Ok(sha) => sha, + 
Err(e) => { + return json!({ + "error": format!("Cannot resolve branch '{}': {}", branch2, e), + "isError": true + }); + } + }; + + let mut output = format!("=== Branch Comparison: {} vs {} ===\n\n", branch1, branch2); + + // Show branch tips + output.push_str("Branch Tips:\n"); + output.push_str(&format!(" {}: {}\n", branch1, &sha1[..12.min(sha1.len())])); + output.push_str(&format!( + " {}: {}\n\n", + branch2, + &sha2[..12.min(sha2.len())] + )); + + // Try to find merge base + match git::find_merge_base(&self.git_repo_path, &sha1, &sha2) { + Ok(merge_base) => { + output.push_str(&format!( + "Merge Base: {}\n", + &merge_base[..12.min(merge_base.len())] + )); + + // Show which branch is ahead + if merge_base == sha1 { + output.push_str(&format!("\n{} is behind {}\n", branch1, branch2)); + } else if merge_base == sha2 { + output.push_str(&format!("\n{} is behind {}\n", branch2, branch1)); + } else { + output.push_str("\nBranches have diverged from merge base\n"); + } + } + Err(e) => { + output.push_str(&format!("Could not find merge base: {}\n", e)); + } + } + + // Check indexing status for both branches + output.push_str("\nIndexing Status:\n"); + match self.db.get_indexed_branch_info(branch1).await { + Ok(Some(info)) => { + let status = if info.tip_commit == sha1 { + "up-to-date" + } else { + "outdated" + }; + output.push_str(&format!( + " {}: {} (indexed at {})\n", + branch1, + status, + &info.tip_commit[..8.min(info.tip_commit.len())] + )); + } + Ok(None) => { + output.push_str(&format!(" {}: not indexed\n", branch1)); + } + Err(_) => { + output.push_str(&format!(" {}: unknown\n", branch1)); + } + } + match self.db.get_indexed_branch_info(branch2).await { + Ok(Some(info)) => { + let status = if info.tip_commit == sha2 { + "up-to-date" + } else { + "outdated" + }; + output.push_str(&format!( + " {}: {} (indexed at {})\n", + branch2, + status, + &info.tip_commit[..8.min(info.tip_commit.len())] + )); + } + Ok(None) => { + output.push_str(&format!(" {}: not 
indexed\n", branch2)); + } + Err(_) => { + output.push_str(&format!(" {}: unknown\n", branch2)); + } + } + + json!({ + "content": [{"type": "text", "text": output}] + }) + } } async fn mcp_diff_functions(diff_content: &str) -> Result { From 627963fd7bbc940ca95dcdb75daa8667043c42c5 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Sun, 14 Dec 2025 05:32:44 -0500 Subject: [PATCH 8/8] docs: add some branch docs Add branching-example.md explaining how to use semcode's multi-branch support for kernel stable tree maintenance. Signed-off-by: Sasha Levin --- docs/branching-example.md | 404 ++++++++++++++++++++++++++++++++++++++ docs/semcode-mcp.md | 19 +- 2 files changed, 422 insertions(+), 1 deletion(-) create mode 100644 docs/branching-example.md diff --git a/docs/branching-example.md b/docs/branching-example.md new file mode 100644 index 0000000..99754e5 --- /dev/null +++ b/docs/branching-example.md @@ -0,0 +1,404 @@ +# Semcode Multi-Branch Querying Example + +This document demonstrates semcode's ability to query code across different +indexed branches, which is essential for tracking how code evolves across +kernel versions. + +## Why Multi-Branch Support? + +The Linux kernel maintains multiple stable branches simultaneously (5.10.y, +6.1.y, 6.6.y, 6.12.y, etc.), each receiving ongoing security fixes and +backports. When analyzing CVEs, backports, or API evolution, you often need +to query the same function across different kernel versions. + +**Primary use case: Backport decisions with autosel** + +When a fix lands in mainline, the stable team needs to determine: +- Which LTS branches are affected by this bug? +- Does the vulnerable code even exist in older branches? +- Will the fix apply cleanly, or has the code changed too much? + +Multi-branch queries answer these questions instantly, without manually +checking out each branch or running git-blame archaeology. + +Semcode's branch tracking helps you: +1. 
**Index once, query many** - Index multiple stable branches and switch + between them instantly without re-indexing +2. **Track branch freshness** - Know when a branch needs re-indexing because + new commits were pushed (e.g., a new 6.12.87 release) +3. **Compare versions** - Understand how code evolved between kernel versions +4. **Automate backport scope** - Programmatically determine which branches + need a fix by checking if the affected code exists + +## Listing Available Branches + +Use `list_branches` to see all indexed branches and their status: + +``` +=== Indexed Branches === + + origin/master (c9b47175) [origin] + Status: outdated + + stable/linux-5.10.y (f964b940) [stable] + Status: up-to-date + + stable/linux-6.1.y (50cbba13) [stable] + Status: up-to-date + + stable/linux-6.12.y (dcbeffaf) [stable] + Status: up-to-date + + stable/linux-6.17.y (5439375c) [stable] + Status: up-to-date + + stable/linux-6.18.y (7d0a66e4) [stable] + Status: up-to-date + +Total: 6 branch(es) indexed +``` + +### Understanding Branch Status + +Each branch shows a **status** field: + +- **up-to-date**: The indexed commit matches the current branch tip. Queries + reflect the latest code. +- **outdated**: The branch has received new commits since indexing. The + indexed SHA (shown in parentheses) is behind the current branch tip. + +When a branch is outdated, you can still query it - you'll just be querying +against the older indexed version. To update, re-run the indexer: + +```bash +# Re-index specific branches that are outdated +semcode-index -s . --branches stable/linux-6.12.y,stable/linux-6.17.y + +# Or re-index all configured branches +semcode-index -s . --branches stable/linux-5.10.y,stable/linux-6.1.y,stable/linux-6.12.y,stable/linux-6.17.y,stable/linux-6.18.y +``` + +### Typical Workflow: Keeping Stable Branches Current + +For kernel CVE analysis, you might set up a cron job or periodic task: + +```bash +# 1. Fetch latest stable branches +git fetch --all + +# 2. 
Check which branches need updating +semcode list_branches # Look for "outdated" status + +# 3. Re-index outdated branches +semcode-index -s . --branches stable/linux-6.12.y,stable/linux-6.18.y +``` + +## Comparing Branches + +Use `compare_branches` to understand the relationship between two branches: + +``` +=== Branch Comparison: stable/linux-6.1.y vs stable/linux-6.18.y === + +Branch Tips: + stable/linux-6.1.y: 50cbba13faa2 + stable/linux-6.18.y: 7d0a66e4bb90 + +Merge Base: 830b3c68c1fb + +Branches have diverged from merge base + +Indexing Status: + stable/linux-6.1.y: up-to-date (indexed at 50cbba13) + stable/linux-6.18.y: up-to-date (indexed at 7d0a66e4) +``` + +## Querying Functions Across Branches + +### Example: Tracking io_uring_setup Evolution + +The `branch` parameter works with most semcode tools. Here's how +`io_uring_setup` changed between kernel 5.10 and 6.18: + +**Linux 5.10 (stable/linux-5.10.y):** + +```c +// File: io_uring/io_uring.c:10311-10330 +static long io_uring_setup(u32 entries, struct io_uring_params __user *params) +{ + struct io_uring_params p; + int i; + + if (copy_from_user(&p, params, sizeof(p))) + return -EFAULT; + for (i = 0; i < ARRAY_SIZE(p.resv); i++) { + if (p.resv[i]) + return -EINVAL; + } + + // Flags checked individually - only 7 flags supported + if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | + IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | + IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | + IORING_SETUP_R_DISABLED)) + return -EINVAL; + + return io_uring_create(entries, &p, params); +} +``` + +**Linux 6.18 (stable/linux-6.18.y):** + +```c +// File: io_uring/io_uring.c:3924-3939 +static long io_uring_setup(u32 entries, struct io_uring_params __user *params) +{ + struct io_uring_params p; + int i; + + if (copy_from_user(&p, params, sizeof(p))) + return -EFAULT; + for (i = 0; i < ARRAY_SIZE(p.resv); i++) { + if (p.resv[i]) + return -EINVAL; + } + + // Consolidated into IORING_SETUP_FLAGS macro + if (p.flags & 
~IORING_SETUP_FLAGS) + return -EINVAL; + return io_uring_create(entries, &p, params); +} +``` + +**Key differences observed:** +1. The function moved from line 10311 to line 3924 of the same file (major refactoring) +2. Individual flag checks consolidated into `IORING_SETUP_FLAGS` macro +3. Code is cleaner and more maintainable in 6.18 + +### Example: Code Organization Changes + +Using `grep_functions` with the `branch` parameter shows how io_uring was +reorganized: + +**Linux 5.10:** All io_uring code in `io_uring/io_uring.c` +``` +io_uring/io_uring.c:io_poll_add_prep:5970 +io_uring/io_uring.c:io_sfr_prep:4683 +io_uring/io_uring.c:io_sq_thread:7562 +``` + +**Linux 6.18:** Code split across multiple files +``` +io_uring/register.c:io_register_restrictions:162 +io_uring/msg_ring.c:io_msg_send_fd:247 +io_uring/sqpoll.c:io_sq_offload_create:451 +io_uring/io_uring.h:io_lockdep_assert_cq_locked:186 +``` + +## Integration with Autosel Backport Workflow + +When autosel (or a human maintainer) evaluates whether to backport a commit to +stable branches, multi-branch queries answer critical questions: + +### Question 1: "How far back should we backport?" + +A fix in mainline might only apply to recent kernels if the vulnerable code +was introduced after older LTS branches diverged. + +```bash +# Does the vulnerable function exist in each stable branch? +find_function(name="vulnerable_func", branch="stable/linux-5.10.y") # Not found = no backport needed +find_function(name="vulnerable_func", branch="stable/linux-6.1.y") # Found = needs backport +find_function(name="vulnerable_func", branch="stable/linux-6.6.y") # Found = needs backport +``` + +**Real example**: The folio APIs don't exist in 5.10.y, so any fix involving +`folio_test_slab` or similar functions only needs backporting to 5.15+. + +### Question 2: "Is the fix applicable as-is, or does it need modification?" + +Code often changes between versions. A fix that cleanly applies to 6.18 might +need adaptation for 5.10. 
+ +```bash +# Compare function signatures across versions +find_function(name="kfree", branch="stable/linux-5.10.y") # In mm/slab.c, SLAB allocator +find_function(name="kfree", branch="stable/linux-6.18.y") # In mm/slub.c, SLUB allocator + +# Check if the fix's context matches +grep_functions(pattern="specific_code_pattern", branch="stable/linux-5.10.y") +grep_functions(pattern="specific_code_pattern", branch="stable/linux-6.1.y") +``` + +If the pattern doesn't exist in 5.10.y, the fix either doesn't apply or needs +a different approach. + +### Question 3: "Is this a real issue in this LTS branch?" + +Sometimes a bug exists in mainline but was never present in older branches +(e.g., introduced by a feature that wasn't backported). + +```bash +# Check if the problematic code path exists +find_callers(name="problematic_func", branch="stable/linux-5.10.y") # 0 callers = not reachable +find_callers(name="problematic_func", branch="stable/linux-6.12.y") # 50 callers = affected + +# Verify the vulnerable pattern is present +grep_functions(pattern="if.*NULL.*&&.*ptr->field", branch="stable/linux-5.10.y") +``` + +### Question 4: "What's the blast radius of this change?" + +Understanding how many callers/callees are affected helps assess risk. + +```bash +# Compare caller counts - more callers = higher risk backport +find_callers(name="affected_func", branch="stable/linux-5.10.y") # 50 callers +find_callers(name="affected_func", branch="stable/linux-6.18.y") # 150 callers + +# The function grew significantly - the 6.18 fix might touch code paths +# that don't exist in 5.10.y +``` + +### Typical Autosel Workflow with Semcode + +``` +1. Commit appears in mainline: "Fix NULL deref in foo_handler()" + +2. Autosel asks: "Should this go to stable?" + → LLM says yes, it's a bug fix + +3. Autosel asks: "Which stable branches need this?" 
+ → Query each branch: + - 5.10.y: foo_handler doesn't exist (introduced in 5.15) → SKIP + - 5.15.y: foo_handler exists, vulnerable pattern present → BACKPORT + - 6.1.y: foo_handler exists, vulnerable pattern present → BACKPORT + - 6.6.y: foo_handler exists, vulnerable pattern present → BACKPORT + - 6.12.y: foo_handler exists, vulnerable pattern present → BACKPORT + +4. Autosel asks: "Will the patch apply cleanly?" + → Compare function bodies between mainline and each target branch + → Flag branches where context differs significantly +``` + +## Use Cases + +### 1. CVE Analysis Across Stable Branches + +When analyzing a security fix, check which stable branches have the fix: + +``` +# Check if a function exists in different branches +find_function(name="vulnerable_func", branch="stable/linux-5.10.y") +find_function(name="vulnerable_func", branch="stable/linux-6.1.y") +``` + +### 2. Backport Verification + +Verify a backport was applied correctly: + +``` +# Compare function implementation across branches +find_function(name="fixed_function", branch="origin/master") +find_function(name="fixed_function", branch="stable/linux-6.1.y") +``` + +### 3. API Evolution Tracking + +Track how APIs change over kernel versions: + +``` +# Find callers to understand usage patterns +find_callers(name="old_api_function", branch="stable/linux-5.10.y") +find_callers(name="new_api_function", branch="stable/linux-6.18.y") +``` + +### 4. Regression Investigation + +When a regression appears in a stable branch, compare with mainline: + +``` +compare_branches(branch1="stable/linux-6.12.y", branch2="origin/master") +grep_functions(pattern="suspicious_pattern", branch="stable/linux-6.12.y") +grep_functions(pattern="suspicious_pattern", branch="origin/master") +``` + +## Indexing Multiple Branches + +To index multiple branches for querying: + +```bash +# Index specific branches +semcode-index -s . 
--branches main,stable/linux-6.1.y,stable/linux-6.12.y + +# Run the query tool with a default branch +semcode --branch stable/linux-6.12.y +``` + +## MCP Tool Parameters + +The following table summarizes `branch` parameter support across the MCP tools: + +| Tool | Branch Support | +|------|----------------| +| `find_function` | Yes | +| `find_type` | Yes | +| `find_callers` | Yes | +| `find_calls` | Yes | +| `find_callchain` | Yes | +| `grep_functions` | Yes | +| `vgrep_functions` | Yes | +| `list_branches` | N/A (lists all) | +| `compare_branches` | Takes two branches | + +The `branch` parameter takes precedence over `git_sha` if both are provided. + +## Real-World Examples from Testing + +### API Evolution: Memory Allocator Changes + +The `kfree` function moved between allocators across kernel versions: + +| Version | File | Allocator | +|---------|------|-----------| +| 5.10.y | `mm/slab.c:3738` | SLAB | +| 6.18.y | `mm/slub.c:6829` | SLUB | + +```bash +# Query shows different implementations +find_function(name="kfree", branch="stable/linux-5.10.y") # SLAB version +find_function(name="kfree", branch="stable/linux-6.18.y") # SLUB version +``` + +### API Renaming: Profiling Annotations + +Some functions were renamed with `_noprof` suffixes for memory profiling: + +| 5.10.y | 6.18.y | +|--------|--------| +| `__kmalloc` | `__kmalloc_noprof` | + +This is why searching for `__kmalloc` in 6.18.y returns a bootloader stub +instead of the main allocator - the real function was renamed.
+ +### Codebase Growth + +Caller counts show kernel growth over time: + +| Function | 5.10.y Callers | 6.18.y Callers | Growth | +|----------|----------------|----------------|--------| +| `kvfree` | 797 | 1,476 | +85% | +| `kfree` | 24,731 | 28,294 | +14% | + +### New APIs + +Some APIs only exist in newer kernels: + +```bash +# folio_test_slab doesn't exist in 5.10 +grep_functions(pattern="folio_test_slab", branch="stable/linux-5.10.y") # 0 results +grep_functions(pattern="folio_test_slab", branch="stable/linux-6.18.y") # 5+ results +``` + +The `folio` abstraction was introduced after 5.10, so related APIs are +missing from older branches. diff --git a/docs/semcode-mcp.md b/docs/semcode-mcp.md index 97b13f1..a9597cf 100644 --- a/docs/semcode-mcp.md +++ b/docs/semcode-mcp.md @@ -1,12 +1,15 @@ # semcode usage guide All semcode functions are git aware and default to lookups on the current -commit. You can also pass a specific commit you're interested in. +commit. You can also pass a specific commit you're interested in, or a branch name. **Note on Regex Patterns**: All regex patterns in semcode are **case-insensitive by default**. This applies to all pattern matching including function names, commit messages, symbols, and lore email searches. You don't need to use the `(?i)` flag. +**Branch Support**: Most query tools support a `branch` parameter as an alternative to `git_sha`. When you specify a branch name (e.g., "main", "develop"), it will be resolved to the current tip commit of that branch. Branch takes precedence over git_sha if both are provided. + **find_function**: search for functions and macros - git_sha: indicates which commit to search (default: current) + - branch: branch name to search (alternative to git_sha, e.g., "main", "develop") - name: function/macro name, or a regex - also displays details on callers and callees **find_type**: search for types and typedefs @@ -74,6 +77,20 @@ commit. You can also pass a specific commit you're interested in. 
sha provided. Mutually exclusive with git_range - page: optional page number for pagination (1-based). Each page contains 50 lines, results indicate current page and total pages. Default: full results +**list_branches**: list all indexed branches with their status + - No parameters required + - Shows branch names, indexed commit SHAs, and freshness status + - **up-to-date**: indexed commit matches current branch tip + - **outdated**: branch has new commits since indexing (re-index to update) + - Useful for tracking multiple stable branches (e.g., linux-5.10.y, linux-6.1.y, linux-6.12.y) + and knowing when they need re-indexing after new releases +**compare_branches**: compare two branches and show their relationship + - branch1: first branch name (e.g., "main") + - branch2: second branch name (e.g., "feature-branch") + - Shows merge base, ahead/behind status, and indexing status for both branches +**indexing_status**: check the status of the background indexing operation + - No parameters required + - Shows current indexing progress, errors, and timing ## Recipes