@@ -255,16 +255,68 @@ def _search_vectors(database_path: str, q_vector: List[float], top_k: int = 5) -
 
 
 def _get_chunk_text(database_path: str, file_id: int, chunk_index: int) -> Optional[str]:
+    """
+    Get chunk text by reading from filesystem instead of database.
+    Uses project_path metadata and file path to read the actual file.
+    """
     conn = _connect_db(database_path)
     try:
         cur = conn.cursor()
-        cur.execute("SELECT content FROM files WHERE id = ?", (file_id,))
+        # Get file path from database
+        cur.execute("SELECT path FROM files WHERE id = ?", (file_id,))
         row = cur.fetchone()
         if not row:
+            logger.warning(f"File not found in database: file_id={file_id}")
+            return None
+
+        file_path = row[0]
+        if not file_path:
+            logger.warning(f"File path is empty for file_id={file_id}")
+            return None
+
+        # Get project path from metadata
+        project_path = get_project_metadata(database_path, "project_path")
+        if not project_path:
+            logger.error("Project path not found in metadata, cannot read file from filesystem")
+            raise RuntimeError("Project path metadata is missing - ensure the indexing process has stored project metadata properly")
+
+        # Construct full file path and resolve to absolute path
+        full_path = os.path.abspath(os.path.join(project_path, file_path))
+        normalized_project_path = os.path.abspath(project_path)
+
+        # Security check: ensure the resolved path is within the project directory
+        try:
+            common = os.path.commonpath([full_path, normalized_project_path])
+            if common != normalized_project_path:
+                logger.error(f"Path traversal attempt detected: {file_path} resolves outside project directory")
+                return None
+            if full_path != normalized_project_path and not full_path.startswith(normalized_project_path + os.sep):
+                logger.error(f"Path traversal attempt detected: {file_path} does not start with project directory")
+                return None
+        except ValueError:
+            logger.error(f"Path traversal attempt detected: {file_path} is on a different drive or incompatible path")
+            return None
+
+        # Read file content from filesystem
+        try:
+            with open(full_path, "r", encoding="utf-8", errors="replace") as fh:
+                content = fh.read()
+        except Exception as e:
+            logger.warning(f"Failed to read file from filesystem: {full_path}, error: {e}")
             return None
-        content = row[0] or ""
+
+        if not content:
+            return None
+
+        # Extract the chunk
         if CHUNK_SIZE <= 0:
             return content
+
+        # Validate chunk_index
+        if chunk_index < 0:
+            logger.warning(f"Invalid chunk_index {chunk_index} for file_id={file_id}")
+            return None
+
         step = max(1, CHUNK_SIZE - CHUNK_OVERLAP)
         start = chunk_index * step
         end = min(start + CHUNK_SIZE, len(content))
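For context on the arithmetic at the end of this hunk: chunks form a sliding window, with consecutive chunks starting `CHUNK_SIZE - CHUNK_OVERLAP` characters apart, so chunk `i` begins at `i * step` and neighbouring chunks share `CHUNK_OVERLAP` characters. A minimal standalone sketch, using illustrative constants rather than the module's real configuration:

```python
# Illustrative values only; the real CHUNK_SIZE / CHUNK_OVERLAP live in the module config.
CHUNK_SIZE = 10
CHUNK_OVERLAP = 3

def chunk_text(content: str, chunk_index: int) -> str:
    # Consecutive chunks start CHUNK_SIZE - CHUNK_OVERLAP characters apart.
    step = max(1, CHUNK_SIZE - CHUNK_OVERLAP)
    start = chunk_index * step
    return content[start:start + CHUNK_SIZE]

text = "abcdefghijklmnopqrstuvwxyz"
print(chunk_text(text, 0))  # 'abcdefghij'
print(chunk_text(text, 1))  # 'hijklmnopq', starts at 7 and overlaps the previous chunk by 3
```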
@@ -309,8 +361,12 @@ def _process_file_sync(
 
     # Check if file needs reindexing (incremental mode)
     if incremental and not needs_reindex(database_path, rel_path, mtime, file_hash):
+        logger.debug(f"Skipping unchanged file: {rel_path}")
         return {"stored": False, "embedded": False, "skipped": True}
 
+    # Log file processing
+    logger.info(f"Processing file: {rel_path}")
+
     # store file (synchronous DB writer) with metadata
     try:
         fid = store_file(database_path, rel_path, content, lang, mtime, file_hash)
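The incremental path depends on `needs_reindex`, which is outside this diff. A minimal sketch of the freshness check it implies, assuming the `files` table keeps `mtime` and `hash` columns alongside `path` (the real schema and helper live in `db.operations`):

```python
import sqlite3

def needs_reindex_sketch(database_path: str, rel_path: str, mtime: float, file_hash: str) -> bool:
    # Reindex when the file is new, or when either its mtime or its hash changed.
    conn = sqlite3.connect(database_path)
    try:
        row = conn.execute(
            "SELECT mtime, hash FROM files WHERE path = ?", (rel_path,)
        ).fetchone()
    finally:
        conn.close()
    if row is None:
        return True
    stored_mtime, stored_hash = row
    return stored_mtime != mtime or stored_hash != file_hash
```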
@@ -426,16 +482,26 @@ def analyze_local_path_sync(
     Submits per-file tasks to a shared ThreadPoolExecutor.
     Supports incremental indexing to skip unchanged files.
     """
+    from db.operations import set_project_metadata
+
     semaphore = threading.Semaphore(EMBEDDING_CONCURRENCY)
     start_time = time.time()
 
+    # Store project path in metadata for filesystem access
+    try:
+        set_project_metadata(database_path, "project_path", local_path)
+        logger.info(f"Starting indexing for project at: {local_path}")
+    except Exception as e:
+        logger.warning(f"Failed to store project path in metadata: {e}")
+
     try:
         file_count = 0
         emb_count = 0
         skipped_count = 0
         file_paths: List[Dict[str, str]] = []
 
         # Collect files to process
+        logger.info("Collecting files to index...")
         for root, dirs, files in os.walk(local_path):
             for fname in files:
                 full = os.path.join(root, fname)
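`set_project_metadata` (and the `get_project_metadata` that `_get_chunk_text` reads back) are imported from `db.operations` and not shown in this diff. A plausible sketch of the key/value upsert they suggest, assuming a `project_metadata` table with a unique `key` column:

```python
import sqlite3

def set_project_metadata_sketch(database_path: str, key: str, value: str) -> None:
    # Upsert; assumes a UNIQUE constraint on project_metadata.key.
    conn = sqlite3.connect(database_path)
    try:
        conn.execute(
            "INSERT INTO project_metadata (key, value) VALUES (?, ?) "
            "ON CONFLICT(key) DO UPDATE SET value = excluded.value",
            (key, value),
        )
        conn.commit()
    finally:
        conn.close()
```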
@@ -447,6 +513,8 @@ def analyze_local_path_sync(
             except Exception:
                 continue
             file_paths.append({"full": full, "rel": rel})
+
+        logger.info(f"Found {len(file_paths)} files to process")
 
         # Process files in chunks to avoid too many futures at once.
         CHUNK_SUBMIT = 256
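The `CHUNK_SUBMIT = 256` constant bounds how many futures exist at once: files go to the shared executor in batches, and each batch is drained before the next is submitted. A standalone sketch of that pattern (`worker` and `items` are hypothetical placeholders):

```python
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable, List

CHUNK_SUBMIT = 256  # cap on outstanding futures per batch

def process_in_batches(executor: ThreadPoolExecutor, items: List, worker: Callable) -> List:
    # Submit a bounded batch, drain it, then move on; memory use stays flat.
    results = []
    for i in range(0, len(items), CHUNK_SUBMIT):
        futures = [executor.submit(worker, item) for item in items[i:i + CHUNK_SUBMIT]]
        for fut in as_completed(futures):
            results.append(fut.result())
    return results
```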
@@ -482,6 +550,9 @@ def analyze_local_path_sync(
         end_time = time.time()
         duration = end_time - start_time
 
+        # Log summary
+        logger.info(f"Indexing completed: {file_count} files processed, {emb_count} embeddings created, {skipped_count} files skipped in {duration:.2f}s")
+
         try:
             # Use batch update for efficiency - single database transaction
             set_project_metadata_batch(database_path, {
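The commonpath containment test added in `_get_chunk_text` is worth isolating, since it is the piece that stops a stored relative path such as `../etc/passwd` from escaping the project root. A standalone sketch (the `is_within_project` name is hypothetical; the example paths are illustrative):

```python
import os

def is_within_project(project_path: str, candidate: str) -> bool:
    # Resolve both paths, then require the project root to be their common prefix.
    project = os.path.abspath(project_path)
    full = os.path.abspath(os.path.join(project, candidate))
    try:
        return os.path.commonpath([full, project]) == project
    except ValueError:
        # Raised for mixed drives on Windows or otherwise incomparable paths.
        return False

print(is_within_project("/srv/repo", "src/main.py"))    # True
print(is_within_project("/srv/repo", "../etc/passwd"))  # False
```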