
Commit ba10c1b

Copilot and Mte90 authored
Add file processing logs and remove content storage from database (#4)
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
1 parent cda0629 commit ba10c1b

File tree: 2 files changed (+78, -6 lines)

ai/analyzer.py

Lines changed: 73 additions & 2 deletions
@@ -255,16 +255,68 @@ def _search_vectors(database_path: str, q_vector: List[float], top_k: int = 5) -
 
 
 def _get_chunk_text(database_path: str, file_id: int, chunk_index: int) -> Optional[str]:
+    """
+    Get chunk text by reading from filesystem instead of database.
+    Uses project_path metadata and file path to read the actual file.
+    """
     conn = _connect_db(database_path)
     try:
         cur = conn.cursor()
-        cur.execute("SELECT content FROM files WHERE id = ?", (file_id,))
+        # Get file path from database
+        cur.execute("SELECT path FROM files WHERE id = ?", (file_id,))
         row = cur.fetchone()
         if not row:
+            logger.warning(f"File not found in database: file_id={file_id}")
+            return None
+
+        file_path = row[0]
+        if not file_path:
+            logger.warning(f"File path is empty for file_id={file_id}")
+            return None
+
+        # Get project path from metadata
+        project_path = get_project_metadata(database_path, "project_path")
+        if not project_path:
+            logger.error("Project path not found in metadata, cannot read file from filesystem")
+            raise RuntimeError("Project path metadata is missing - ensure the indexing process has stored project metadata properly")
+
+        # Construct full file path and resolve to absolute path
+        full_path = os.path.abspath(os.path.join(project_path, file_path))
+        normalized_project_path = os.path.abspath(project_path)
+
+        # Security check: ensure the resolved path is within the project directory
+        try:
+            common = os.path.commonpath([full_path, normalized_project_path])
+            if common != normalized_project_path:
+                logger.error(f"Path traversal attempt detected: {file_path} resolves outside project directory")
+                return None
+            if full_path != normalized_project_path and not full_path.startswith(normalized_project_path + os.sep):
+                logger.error(f"Path traversal attempt detected: {file_path} does not start with project directory")
+                return None
+        except ValueError:
+            logger.error(f"Path traversal attempt detected: {file_path} is on a different drive or incompatible path")
+            return None
+
+        # Read file content from filesystem
+        try:
+            with open(full_path, "r", encoding="utf-8", errors="replace") as fh:
+                content = fh.read()
+        except Exception as e:
+            logger.warning(f"Failed to read file from filesystem: {full_path}, error: {e}")
             return None
-        content = row[0] or ""
+
+        if not content:
+            return None
+
+        # Extract the chunk
         if CHUNK_SIZE <= 0:
             return content
+
+        # Validate chunk_index
+        if chunk_index < 0:
+            logger.warning(f"Invalid chunk_index {chunk_index} for file_id={file_id}")
+            return None
+
         step = max(1, CHUNK_SIZE - CHUNK_OVERLAP)
         start = chunk_index * step
         end = min(start + CHUNK_SIZE, len(content))
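
The new _get_chunk_text combines two pieces of logic worth seeing in isolation: the containment check that keeps a stored relative path from escaping the project root, and the stride arithmetic that turns a chunk_index into character offsets. A minimal standalone sketch of both, using illustrative CHUNK_SIZE / CHUNK_OVERLAP values (the real constants are defined elsewhere in ai/analyzer.py):

```python
import os
from typing import Optional, Tuple

# Illustrative values only; ai/analyzer.py defines its own CHUNK_SIZE / CHUNK_OVERLAP.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200


def resolve_inside_project(project_path: str, rel_path: str) -> Optional[str]:
    """Join rel_path onto project_path and reject anything that escapes it."""
    full = os.path.abspath(os.path.join(project_path, rel_path))
    root = os.path.abspath(project_path)
    try:
        if os.path.commonpath([full, root]) != root:
            return None  # resolves outside the project directory
    except ValueError:
        return None  # different drive or otherwise incomparable paths
    return full


def chunk_bounds(chunk_index: int, content_length: int) -> Tuple[int, int]:
    """Character offsets covered by an overlapping chunk window."""
    step = max(1, CHUNK_SIZE - CHUNK_OVERLAP)
    start = chunk_index * step
    end = min(start + CHUNK_SIZE, content_length)
    return start, end


print(resolve_inside_project("/tmp/project", "../etc/passwd"))  # None
print(chunk_bounds(0, 5000))  # (0, 1000)
print(chunk_bounds(1, 5000))  # (800, 1800)
```

With these assumed values each chunk starts 800 characters after the previous one, so consecutive chunks share a 200-character window; that stride is the invariant a stored chunk_index relies on when the text is re-read from disk.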
@@ -309,8 +361,12 @@ def _process_file_sync(
 
     # Check if file needs reindexing (incremental mode)
     if incremental and not needs_reindex(database_path, rel_path, mtime, file_hash):
+        logger.debug(f"Skipping unchanged file: {rel_path}")
        return {"stored": False, "embedded": False, "skipped": True}
 
+    # Log file processing
+    logger.info(f"Processing file: {rel_path}")
+
     # store file (synchronous DB writer) with metadata
     try:
         fid = store_file(database_path, rel_path, content, lang, mtime, file_hash)
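
needs_reindex itself is not part of this diff; the skip branch fires only when the stored metadata still matches the file on disk. A hypothetical sketch of such a check, assuming the last_modified and file_hash columns that store_file writes (the real implementation lives elsewhere in the codebase and may differ):

```python
import hashlib
import os
import sqlite3


def file_fingerprint(path: str) -> tuple:
    """mtime plus content hash, the two values store_file persists per path."""
    with open(path, "rb") as fh:
        digest = hashlib.sha256(fh.read()).hexdigest()
    return os.path.getmtime(path), digest


def needs_reindex(database_path: str, rel_path: str, mtime: float, file_hash: str) -> bool:
    """Hypothetical: reindex when the row is missing or the stored mtime/hash differ."""
    conn = sqlite3.connect(database_path)
    try:
        row = conn.execute(
            "SELECT last_modified, file_hash FROM files WHERE path = ?", (rel_path,)
        ).fetchone()
    finally:
        conn.close()
    if row is None:
        return True
    stored_mtime, stored_hash = row
    return stored_mtime != mtime or stored_hash != file_hash
```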
@@ -426,16 +482,26 @@ analyze_local_path_sync(
     Submits per-file tasks to a shared ThreadPoolExecutor.
     Supports incremental indexing to skip unchanged files.
     """
+    from db.operations import set_project_metadata
+
     semaphore = threading.Semaphore(EMBEDDING_CONCURRENCY)
     start_time = time.time()
 
+    # Store project path in metadata for filesystem access
+    try:
+        set_project_metadata(database_path, "project_path", local_path)
+        logger.info(f"Starting indexing for project at: {local_path}")
+    except Exception as e:
+        logger.warning(f"Failed to store project path in metadata: {e}")
+
     try:
         file_count = 0
         emb_count = 0
         skipped_count = 0
         file_paths: List[Dict[str, str]] = []
 
         # Collect files to process
+        logger.info("Collecting files to index...")
         for root, dirs, files in os.walk(local_path):
             for fname in files:
                 full = os.path.join(root, fname)
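
The project_path written here is exactly what _get_chunk_text reads back with get_project_metadata at query time. The commit does not show how db.operations stores it; a minimal key/value round-trip with the same call signatures, assuming a simple project_metadata(key, value) table (the real helpers in db/operations.py may differ), would look like:

```python
import sqlite3

# Hypothetical stand-in for db.operations.set_project_metadata / get_project_metadata;
# the real schema and helpers live in db/operations.py and may differ.
def set_project_metadata(database_path: str, key: str, value: str) -> None:
    with sqlite3.connect(database_path) as conn:
        conn.execute(
            "CREATE TABLE IF NOT EXISTS project_metadata (key TEXT PRIMARY KEY, value TEXT)"
        )
        conn.execute(
            "INSERT INTO project_metadata (key, value) VALUES (?, ?) "
            "ON CONFLICT(key) DO UPDATE SET value=excluded.value",
            (key, value),
        )


def get_project_metadata(database_path: str, key: str):
    with sqlite3.connect(database_path) as conn:
        row = conn.execute(
            "SELECT value FROM project_metadata WHERE key = ?", (key,)
        ).fetchone()
    return row[0] if row else None


# Placeholder database path and project directory, for illustration only.
set_project_metadata("index.db", "project_path", "/home/user/project")
print(get_project_metadata("index.db", "project_path"))  # /home/user/project
```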
@@ -447,6 +513,8 @@ analyze_local_path_sync(
                 except Exception:
                     continue
                 file_paths.append({"full": full, "rel": rel})
+
+        logger.info(f"Found {len(file_paths)} files to process")
 
         # Process files in chunks to avoid too many futures at once.
         CHUNK_SUBMIT = 256
@@ -482,6 +550,9 @@ analyze_local_path_sync(
         end_time = time.time()
         duration = end_time - start_time
 
+        # Log summary
+        logger.info(f"Indexing completed: {file_count} files processed, {emb_count} embeddings created, {skipped_count} files skipped in {duration:.2f}s")
+
         try:
             # Use batch update for efficiency - single database transaction
             set_project_metadata_batch(database_path, {

db/operations.py

Lines changed: 5 additions & 4 deletions
@@ -233,21 +233,22 @@ def store_file(database_path, path, content, language, last_modified=None, file_
     Insert or update a file record into the DB using a queued single-writer to avoid
     sqlite 'database is locked' errors in multithreaded scenarios.
     Supports incremental indexing with last_modified and file_hash tracking.
+    Note: Does not store full file content in database (only snippet), content is read from filesystem when needed.
+    The content parameter is still required to generate the snippet.
     Returns lastrowid (same as the previous store_file implementation).
     """
     snippet = (content[:512] if content else "")
     sql = """
-    INSERT INTO files (path, content, language, snippet, last_modified, file_hash, updated_at)
-    VALUES (?, ?, ?, ?, ?, ?, datetime('now'))
+    INSERT INTO files (path, language, snippet, last_modified, file_hash, updated_at)
+    VALUES (?, ?, ?, ?, ?, datetime('now'))
     ON CONFLICT(path) DO UPDATE SET
-        content=excluded.content,
         language=excluded.language,
         snippet=excluded.snippet,
         last_modified=excluded.last_modified,
         file_hash=excluded.file_hash,
         updated_at=datetime('now')
     """
-    params = (path, content, language, snippet, last_modified, file_hash)
+    params = (path, language, snippet, last_modified, file_hash)
 
     writer = _get_writer(database_path)
     # We wait for the background writer to complete the insert and then return the row id.
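
The statement keeps the ON CONFLICT(path) upsert, so re-indexing a path updates the existing row instead of inserting a duplicate; only the content column has dropped out of both the INSERT and the UPDATE branches. A self-contained sketch of that upsert pattern against a throwaway, simplified stand-in for the files table (column list trimmed for the example):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
# Simplified, hypothetical stand-in for the real files table.
conn.execute(
    "CREATE TABLE files (id INTEGER PRIMARY KEY, path TEXT UNIQUE, "
    "language TEXT, snippet TEXT, file_hash TEXT)"
)

sql = """
    INSERT INTO files (path, language, snippet, file_hash)
    VALUES (?, ?, ?, ?)
    ON CONFLICT(path) DO UPDATE SET
        language=excluded.language,
        snippet=excluded.snippet,
        file_hash=excluded.file_hash
"""
conn.execute(sql, ("src/app.py", "python", "print('v1')", "hash-1"))
conn.execute(sql, ("src/app.py", "python", "print('v2')", "hash-2"))  # same path: update, not insert

print(conn.execute("SELECT COUNT(*), snippet FROM files").fetchone())  # (1, "print('v2')")
conn.close()
```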
