
Commit ba10c1b

Copilot and Mte90 authored
Add file processing logs and remove content storage from database (#4)
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
1 parent cda0629 commit ba10c1b

File tree: 2 files changed (+78, -6 lines)

ai/analyzer.py

Lines changed: 73 additions & 2 deletions
@@ -255,16 +255,68 @@ def _search_vectors(database_path: str, q_vector: List[float], top_k: int = 5) -
 
 
 def _get_chunk_text(database_path: str, file_id: int, chunk_index: int) -> Optional[str]:
+    """
+    Get chunk text by reading from filesystem instead of database.
+    Uses project_path metadata and file path to read the actual file.
+    """
     conn = _connect_db(database_path)
     try:
         cur = conn.cursor()
-        cur.execute("SELECT content FROM files WHERE id = ?", (file_id,))
+        # Get file path from database
+        cur.execute("SELECT path FROM files WHERE id = ?", (file_id,))
         row = cur.fetchone()
         if not row:
+            logger.warning(f"File not found in database: file_id={file_id}")
+            return None
+
+        file_path = row[0]
+        if not file_path:
+            logger.warning(f"File path is empty for file_id={file_id}")
+            return None
+
+        # Get project path from metadata
+        project_path = get_project_metadata(database_path, "project_path")
+        if not project_path:
+            logger.error("Project path not found in metadata, cannot read file from filesystem")
+            raise RuntimeError("Project path metadata is missing - ensure the indexing process has stored project metadata properly")
+
+        # Construct full file path and resolve to absolute path
+        full_path = os.path.abspath(os.path.join(project_path, file_path))
+        normalized_project_path = os.path.abspath(project_path)
+
+        # Security check: ensure the resolved path is within the project directory
+        try:
+            common = os.path.commonpath([full_path, normalized_project_path])
+            if common != normalized_project_path:
+                logger.error(f"Path traversal attempt detected: {file_path} resolves outside project directory")
+                return None
+            if full_path != normalized_project_path and not full_path.startswith(normalized_project_path + os.sep):
+                logger.error(f"Path traversal attempt detected: {file_path} does not start with project directory")
+                return None
+        except ValueError:
+            logger.error(f"Path traversal attempt detected: {file_path} is on a different drive or incompatible path")
+            return None
+
+        # Read file content from filesystem
+        try:
+            with open(full_path, "r", encoding="utf-8", errors="replace") as fh:
+                content = fh.read()
+        except Exception as e:
+            logger.warning(f"Failed to read file from filesystem: {full_path}, error: {e}")
             return None
-        content = row[0] or ""
+
+        if not content:
+            return None
+
+        # Extract the chunk
         if CHUNK_SIZE <= 0:
             return content
+
+        # Validate chunk_index
+        if chunk_index < 0:
+            logger.warning(f"Invalid chunk_index {chunk_index} for file_id={file_id}")
+            return None
+
         step = max(1, CHUNK_SIZE - CHUNK_OVERLAP)
         start = chunk_index * step
         end = min(start + CHUNK_SIZE, len(content))
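
The new _get_chunk_text combines two pieces of logic worth seeing in isolation: the containment check that keeps a stored relative path from escaping the project root, and the stride arithmetic that turns a chunk_index into character offsets. A minimal standalone sketch of both, using illustrative CHUNK_SIZE / CHUNK_OVERLAP values (the real constants are defined elsewhere in ai/analyzer.py):

```python
import os
from typing import Optional, Tuple

# Illustrative values only; ai/analyzer.py defines its own CHUNK_SIZE / CHUNK_OVERLAP.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200


def resolve_inside_project(project_path: str, rel_path: str) -> Optional[str]:
    """Join rel_path onto project_path and reject anything that escapes it."""
    full = os.path.abspath(os.path.join(project_path, rel_path))
    root = os.path.abspath(project_path)
    try:
        if os.path.commonpath([full, root]) != root:
            return None  # resolves outside the project directory
    except ValueError:
        return None  # different drive or otherwise incomparable paths
    return full


def chunk_bounds(chunk_index: int, content_length: int) -> Tuple[int, int]:
    """Character offsets covered by an overlapping chunk window."""
    step = max(1, CHUNK_SIZE - CHUNK_OVERLAP)
    start = chunk_index * step
    end = min(start + CHUNK_SIZE, content_length)
    return start, end


print(resolve_inside_project("/tmp/project", "../etc/passwd"))  # None
print(chunk_bounds(0, 5000))  # (0, 1000)
print(chunk_bounds(1, 5000))  # (800, 1800)
```

With these assumed values each chunk starts 800 characters after the previous one, so consecutive chunks share a 200-character window; that stride is the invariant a stored chunk_index relies on when the text is re-read from disk.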
@@ -309,8 +361,12 @@ def _process_file_sync(
 
     # Check if file needs reindexing (incremental mode)
     if incremental and not needs_reindex(database_path, rel_path, mtime, file_hash):
+        logger.debug(f"Skipping unchanged file: {rel_path}")
        return {"stored": False, "embedded": False, "skipped": True}
 
+    # Log file processing
+    logger.info(f"Processing file: {rel_path}")
+
     # store file (synchronous DB writer) with metadata
     try:
         fid = store_file(database_path, rel_path, content, lang, mtime, file_hash)
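
needs_reindex itself is not part of this diff; the skip branch fires only when the stored metadata still matches the file on disk. A hypothetical sketch of such a check, assuming the last_modified and file_hash columns that store_file writes (the real implementation lives elsewhere in the codebase and may differ):

```python
import hashlib
import os
import sqlite3


def file_fingerprint(path: str) -> tuple:
    """mtime plus content hash, the two values store_file persists per path."""
    with open(path, "rb") as fh:
        digest = hashlib.sha256(fh.read()).hexdigest()
    return os.path.getmtime(path), digest


def needs_reindex(database_path: str, rel_path: str, mtime: float, file_hash: str) -> bool:
    """Hypothetical: reindex when the row is missing or the stored mtime/hash differ."""
    conn = sqlite3.connect(database_path)
    try:
        row = conn.execute(
            "SELECT last_modified, file_hash FROM files WHERE path = ?", (rel_path,)
        ).fetchone()
    finally:
        conn.close()
    if row is None:
        return True
    stored_mtime, stored_hash = row
    return stored_mtime != mtime or stored_hash != file_hash
```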
@@ -426,16 +482,26 @@ analyze_local_path_sync(
     Submits per-file tasks to a shared ThreadPoolExecutor.
     Supports incremental indexing to skip unchanged files.
     """
+    from db.operations import set_project_metadata
+
     semaphore = threading.Semaphore(EMBEDDING_CONCURRENCY)
     start_time = time.time()
 
+    # Store project path in metadata for filesystem access
+    try:
+        set_project_metadata(database_path, "project_path", local_path)
+        logger.info(f"Starting indexing for project at: {local_path}")
+    except Exception as e:
+        logger.warning(f"Failed to store project path in metadata: {e}")
+
     try:
         file_count = 0
         emb_count = 0
         skipped_count = 0
         file_paths: List[Dict[str, str]] = []
 
         # Collect files to process
+        logger.info("Collecting files to index...")
         for root, dirs, files in os.walk(local_path):
             for fname in files:
                 full = os.path.join(root, fname)
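
The project_path written here is exactly what _get_chunk_text reads back with get_project_metadata at query time. The commit does not show how db.operations stores it; a minimal key/value round-trip with the same call signatures, assuming a simple project_metadata(key, value) table (the real helpers in db/operations.py may differ), would look like:

```python
import sqlite3

# Hypothetical stand-in for db.operations.set_project_metadata / get_project_metadata;
# the real schema and helpers live in db/operations.py and may differ.
def set_project_metadata(database_path: str, key: str, value: str) -> None:
    with sqlite3.connect(database_path) as conn:
        conn.execute(
            "CREATE TABLE IF NOT EXISTS project_metadata (key TEXT PRIMARY KEY, value TEXT)"
        )
        conn.execute(
            "INSERT INTO project_metadata (key, value) VALUES (?, ?) "
            "ON CONFLICT(key) DO UPDATE SET value=excluded.value",
            (key, value),
        )


def get_project_metadata(database_path: str, key: str):
    with sqlite3.connect(database_path) as conn:
        row = conn.execute(
            "SELECT value FROM project_metadata WHERE key = ?", (key,)
        ).fetchone()
    return row[0] if row else None


# Placeholder database path and project directory, for illustration only.
set_project_metadata("index.db", "project_path", "/home/user/project")
print(get_project_metadata("index.db", "project_path"))  # /home/user/project
```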
@@ -447,6 +513,8 @@ analyze_local_path_sync(
                 except Exception:
                     continue
                 file_paths.append({"full": full, "rel": rel})
+
+        logger.info(f"Found {len(file_paths)} files to process")
 
         # Process files in chunks to avoid too many futures at once.
         CHUNK_SUBMIT = 256
@@ -482,6 +550,9 @@ analyze_local_path_sync(
         end_time = time.time()
         duration = end_time - start_time
 
+        # Log summary
+        logger.info(f"Indexing completed: {file_count} files processed, {emb_count} embeddings created, {skipped_count} files skipped in {duration:.2f}s")
+
         try:
             # Use batch update for efficiency - single database transaction
             set_project_metadata_batch(database_path, {

db/operations.py

Lines changed: 5 additions & 4 deletions
@@ -233,21 +233,22 @@ def store_file(database_path, path, content, language, last_modified=None, file_
     Insert or update a file record into the DB using a queued single-writer to avoid
     sqlite 'database is locked' errors in multithreaded scenarios.
     Supports incremental indexing with last_modified and file_hash tracking.
+    Note: Does not store full file content in database (only snippet), content is read from filesystem when needed.
+    The content parameter is still required to generate the snippet.
     Returns lastrowid (same as the previous store_file implementation).
     """
     snippet = (content[:512] if content else "")
     sql = """
-    INSERT INTO files (path, content, language, snippet, last_modified, file_hash, updated_at)
-    VALUES (?, ?, ?, ?, ?, ?, datetime('now'))
+    INSERT INTO files (path, language, snippet, last_modified, file_hash, updated_at)
+    VALUES (?, ?, ?, ?, ?, datetime('now'))
     ON CONFLICT(path) DO UPDATE SET
-        content=excluded.content,
         language=excluded.language,
         snippet=excluded.snippet,
         last_modified=excluded.last_modified,
         file_hash=excluded.file_hash,
         updated_at=datetime('now')
     """
-    params = (path, content, language, snippet, last_modified, file_hash)
+    params = (path, language, snippet, last_modified, file_hash)
 
     writer = _get_writer(database_path)
     # We wait for the background writer to complete the insert and then return the row id.
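
The statement keeps the ON CONFLICT(path) upsert, so re-indexing a path updates the existing row instead of inserting a duplicate; only the content column has dropped out of both the INSERT and the UPDATE branches. A self-contained sketch of that upsert pattern against a throwaway, simplified stand-in for the files table (column list trimmed for the example):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
# Simplified, hypothetical stand-in for the real files table.
conn.execute(
    "CREATE TABLE files (id INTEGER PRIMARY KEY, path TEXT UNIQUE, "
    "language TEXT, snippet TEXT, file_hash TEXT)"
)

sql = """
    INSERT INTO files (path, language, snippet, file_hash)
    VALUES (?, ?, ?, ?)
    ON CONFLICT(path) DO UPDATE SET
        language=excluded.language,
        snippet=excluded.snippet,
        file_hash=excluded.file_hash
"""
conn.execute(sql, ("src/app.py", "python", "print('v1')", "hash-1"))
conn.execute(sql, ("src/app.py", "python", "print('v2')", "hash-2"))  # same path: update, not insert

print(conn.execute("SELECT COUNT(*), snippet FROM files").fetchone())  # (1, "print('v2')")
conn.close()
```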
