diff --git a/.gitignore b/.gitignore index 9e44d02..0b1c523 100644 --- a/.gitignore +++ b/.gitignore @@ -185,4 +185,5 @@ examples/hf_demo_space/.chainlit/* examples/hf_demo_space/chainlit.md examples/hf_demo_space/public/ +database.db-journal .chainlit/ diff --git a/codetide/agents/tide/agent.py b/codetide/agents/tide/agent.py index ece0e67..cf39e99 100644 --- a/codetide/agents/tide/agent.py +++ b/codetide/agents/tide/agent.py @@ -1,12 +1,13 @@ -from functools import partial from codetide import CodeTide +from codetide.search.code_search import SmartCodeSearch from ...mcp.tools.patch_code import file_exists, open_file, process_patch, remove_file, write_file, parse_patch_blocks from ...core.defaults import DEFAULT_ENCODING, DEFAULT_STORAGE_PATH +from ...parsers import SUPPORTED_LANGUAGES from ...autocomplete import AutoComplete from .models import Steps from .prompts import ( - AGENT_TIDE_SYSTEM_PROMPT, GET_CODE_IDENTIFIERS_SYSTEM_PROMPT, REJECT_PATCH_FEEDBACK_TEMPLATE, - STAGED_DIFFS_TEMPLATE, STEPS_SYSTEM_PROMPT, WRITE_PATCH_SYSTEM_PROMPT + AGENT_TIDE_SYSTEM_PROMPT, CALMNESS_SYSTEM_PROMPT, GET_CODE_IDENTIFIERS_SYSTEM_PROMPT, README_CONTEXT_PROMPT, REJECT_PATCH_FEEDBACK_TEMPLATE, + REPO_TREE_CONTEXT_PROMPT, STAGED_DIFFS_TEMPLATE, STEPS_SYSTEM_PROMPT, WRITE_PATCH_SYSTEM_PROMPT ) from .utils import delete_file, parse_blocks, parse_steps_markdown, trim_to_patch_section from .consts import AGENT_TIDE_ASCII_ART @@ -20,11 +21,12 @@ "Install it with: pip install codetide[agents]" ) from e +from pydantic import BaseModel, Field, ConfigDict, model_validator from prompt_toolkit.key_binding import KeyBindings from prompt_toolkit import PromptSession -from pydantic import BaseModel, Field, model_validator -from typing_extensions import Self from typing import List, Optional, Set +from typing_extensions import Self +from functools import partial from datetime import date from pathlib import Path from ulid import ulid @@ -58,11 +60,37 @@ class AgentTide(BaseModel): _last_code_context :Optional[str] = None _has_patch :bool=False + model_config = ConfigDict(arbitrary_types_allowed=True) + @model_validator(mode="after") def pass_custom_logger_fn(self)->Self: self.llm.logger_fn = partial(custom_logger_fn, session_id=self.session_id, filepath=self.patch_path) return self - + + async def get_repo_tree_from_user_prompt(self, history :list)->str: + + history_str = "\n\n".join([str(entry) for entry in history]) + ### TODO evalutate sending last N messages and giving more importance to + ### search results from latter messages + + nodes_dict = self.tide.codebase.compile_tree_nodes_dict() + nodes_dict = { + filepath: contents for filepath, elements in nodes_dict.items() + if (contents := "\n".join([filepath] + elements).strip()) + } + + codeSearch = SmartCodeSearch(documents=nodes_dict) + await codeSearch.initialize_async() + + results = await codeSearch.search_smart(history_str, top_k=5) + + self.tide.codebase._build_tree_dict([doc_key for doc_key,_ in results] or None) + + return self.tide.codebase.get_tree_view( + include_modules=True, + include_types=True + ) + def approve(self): self._has_patch = False if os.path.exists(self.patch_path): @@ -102,59 +130,65 @@ async def agent_loop(self, codeIdentifiers :Optional[List[str]]=None): # update codetide with the latest changes made by the human and agent await self.tide.check_for_updates(serialize=True, include_cached_ids=True) - repo_tree = self.tide.codebase.get_tree_view( - include_modules=True, - include_types=True - ) - - if codeIdentifiers is None and not 
self._skip_context_retrieval: - context_response = await self.llm.acomplete( - self.history, - system_prompt=[GET_CODE_IDENTIFIERS_SYSTEM_PROMPT.format(DATE=TODAY)], - prefix_prompt=repo_tree, - stream=False - # json_output=True - ) - - contextIdentifiers = parse_blocks(context_response, block_word="Context Identifiers", multiple=False) - modifyIdentifiers = parse_blocks(context_response, block_word="Modify Identifiers", multiple=False) - - reasoning = context_response.split("*** Begin") - if not reasoning: - reasoning = [context_response] - self.reasoning = reasoning[0].strip() - - self.contextIdentifiers = contextIdentifiers.splitlines() if isinstance(contextIdentifiers, str) else None - self.modifyIdentifiers = modifyIdentifiers.splitlines() if isinstance(modifyIdentifiers, str) else None - codeIdentifiers = self.contextIdentifiers or [] - - if self.modifyIdentifiers: - codeIdentifiers.extend(self.tide._as_file_paths(self.modifyIdentifiers)) - codeContext = None - if codeIdentifiers: - autocomplete = AutoComplete(self.tide.cached_ids) - # Validate each code identifier - validatedCodeIdentifiers = [] - for codeId in codeIdentifiers: - result = autocomplete.validate_code_identifier(codeId) - if result.get("is_valid"): - validatedCodeIdentifiers.append(codeId) + if self._skip_context_retrieval: + ... + else: + if codeIdentifiers is None: + repo_tree = await self.get_repo_tree_from_user_prompt(self.history) + context_response = await self.llm.acomplete( + self.history, + system_prompt=[GET_CODE_IDENTIFIERS_SYSTEM_PROMPT.format(DATE=TODAY, SUPPORTED_LANGUAGES=SUPPORTED_LANGUAGES)], # TODO improve this prompt to handle generic scenarios liek what does my porject do and so on + prefix_prompt=repo_tree, + stream=False + # json_output=True + ) + + contextIdentifiers = parse_blocks(context_response, block_word="Context Identifiers", multiple=False) + modifyIdentifiers = parse_blocks(context_response, block_word="Modify Identifiers", multiple=False) + + reasoning = context_response.split("*** Begin") + if not reasoning: + reasoning = [context_response] + self.reasoning = reasoning[0].strip() + + self.contextIdentifiers = contextIdentifiers.splitlines() if isinstance(contextIdentifiers, str) else None + self.modifyIdentifiers = modifyIdentifiers.splitlines() if isinstance(modifyIdentifiers, str) else None + codeIdentifiers = self.contextIdentifiers or [] - elif result.get("matching_identifiers"): - validatedCodeIdentifiers.append(result.get("matching_identifiers")[0]) + if self.modifyIdentifiers: + codeIdentifiers.extend(self.tide._as_file_paths(self.modifyIdentifiers)) + + if codeIdentifiers: + autocomplete = AutoComplete(self.tide.cached_ids) + # Validate each code identifier + validatedCodeIdentifiers = [] + for codeId in codeIdentifiers: + result = autocomplete.validate_code_identifier(codeId) + if result.get("is_valid"): + validatedCodeIdentifiers.append(codeId) + + elif result.get("matching_identifiers"): + validatedCodeIdentifiers.append(result.get("matching_identifiers")[0]) - self._last_code_identifers = set(validatedCodeIdentifiers) - codeContext = self.tide.get(validatedCodeIdentifiers, as_string=True) - self._last_code_context = codeContext + self._last_code_identifers = set(validatedCodeIdentifiers) + codeContext = self.tide.get(validatedCodeIdentifiers, as_string=True) + + if not codeContext: + codeContext = REPO_TREE_CONTEXT_PROMPT.format(REPO_TREE=self.tide.codebase.get_tree_view()) + readmeFile = self.tide.get("README.md", as_string_list=True) + if readmeFile: + codeContext = 
"\n".join([codeContext, README_CONTEXT_PROMPT.format(README=readmeFile)]) + self._last_code_context = codeContext await delete_file(self.patch_path) response = await self.llm.acomplete( self.history, system_prompt=[ AGENT_TIDE_SYSTEM_PROMPT.format(DATE=TODAY), - STEPS_SYSTEM_PROMPT.format(DATE=TODAY, REPO_TREE=repo_tree), - WRITE_PATCH_SYSTEM_PROMPT.format(DATE=TODAY) + STEPS_SYSTEM_PROMPT.format(DATE=TODAY), + WRITE_PATCH_SYSTEM_PROMPT.format(DATE=TODAY), + CALMNESS_SYSTEM_PROMPT ], prefix_prompt=codeContext ) diff --git a/codetide/agents/tide/prompts.py b/codetide/agents/tide/prompts.py index 37cb23f..c3287a6 100644 --- a/codetide/agents/tide/prompts.py +++ b/codetide/agents/tide/prompts.py @@ -52,56 +52,87 @@ """ GET_CODE_IDENTIFIERS_SYSTEM_PROMPT = """ -You are Agent **Tide**, operating in **Identifier Resolution Mode** on **{DATE}**. You have received a user request and a visual representation of the code repository structure. Your task is to determine which code-level identifiers (such as functions, classes, methods, variables, or attributes) or, if necessary, file paths are relevant for fulfilling the request. +You are Agent **Tide**, operating in **Identifier Resolution Mode** on **{DATE}**. You have received a user request and a repository tree structure that includes file contents information. +Your task is to determine which code-level identifiers or file paths are relevant for fulfilling the request. +You are operating under a strict **single-call constraint**: the repository tree structure can only be retrieved **once per task**. Do **not** request additional tree information. -You are operating under a strict **single-call constraint**: the repository tree structure (via `getRepoTree()`) can only be retrieved **once per task**, and you must extract maximum value from it. Do **not** request the tree again under any circumstances. +--- + +**SUPPORTED_LANGUAGES** are: {SUPPORTED_LANGUAGES} --- -**Instructions:** +**Core Rules:** + +1. **Language-Based Decision Making:** + - For files in **SUPPORTED_LANGUAGES** (as indicated in the tree): Return **code identifiers** (functions, classes, methods, variables, attributes) + - For files **NOT** in SUPPORTED_LANGUAGES: Return **file paths** only + - Code identifiers should use dot notation (e.g., `module.submodule.Class.method`) without file extensions -1. Carefully read and interpret the user's request, identifying any references to files, modules, submodules, or code elements—either explicit or implied. -2. **Segregate identifiers into two categories:** - - **Context Identifiers:** Code elements (functions, classes, methods, variables, attributes, or file paths) that are required to understand, reference, or provide context for the requested change, but are not themselves expected to be modified. - - **Modify Identifiers:** Code elements (functions, classes, methods, variables, attributes, or file paths) that are likely to require direct modification to fulfill the user's request. -3. **Prioritize returning fully qualified code identifiers** (using dot notation, e.g., `module.submodule.Class.method`), without file extensions. Only include file paths (relative to the repository root) if: - - The user explicitly requests file-level operations (such as adding, deleting, or renaming files), or - - No valid or relevant code identifiers can be determined for the request. -4. 
If the user refers to a file by name or path and the request is about code elements within that file, extract and include the relevant code identifiers from that file instead of the file path, unless the user specifically asks for the file path. -5. If fulfilling the request would likely depend on additional symbols or files—based on naming, structure, required context from other files/modules, or conventional design patterns—include those code identifiers as context identifiers. -6. Only include identifiers or paths that are present in the provided tree structure. Never fabricate or guess paths or names that do not exist. -7. If no relevant code identifiers or file paths can be confidently identified, leave the relevant section(s) empty - without any contents or lines, not even the word empty. +2. **Identifier Categories:** + - **Context Identifiers:** Elements needed to understand or provide context for the request, but not directly modified + - **Modify Identifiers:** Elements that will likely require direct modification to fulfill the request --- -**Output Format:** +**Step-by-Step Process:** + +1. **Parse the user request** to identify: + - Explicit file/module/code element references + - Implicit requirements based on the task description + - Scope of changes needed (file-level vs code-level) + +2. **Analyze the repository tree** to: + - Locate relevant files and their language support status + - Identify code elements within supported language files + - Map user requirements to actual repository structure + +3. **Apply the language rule:** + - **If file is in SUPPORTED_LANGUAGES:** Extract relevant code identifiers from the parsed content + - **If file is NOT in SUPPORTED_LANGUAGES:** Use the file path instead + - **Exception:** If user explicitly requests file-level operations (create, delete, rename files), return file paths regardless of language -Your response must include: +4. **Include contextual dependencies:** + - Related modules, classes, or functions that provide necessary context + - Configuration files, README, or documentation when dealing with broad/architectural questions + - **When in doubt about scope, always include README for project context** + +--- + +**Special Cases:** + +- **Broad/General Requests:** Include README and relevant config files (pyproject.toml, setup.py, etc.) as context +- **File-Level Operations:** Return file paths even for supported languages when the operation targets the file itself +- **Non-Existent Elements:** Only include identifiers/paths that actually exist in the provided tree structure +- **Empty Results:** Leave sections completely empty (no placeholder text) if no relevant identifiers are found + +--- + +**Output Format:** -1. A brief explanation (1-3 sentences) describing your reasoning and search process for selecting the identifiers. -2. The following delimited sections, each containing a newline-separated list of identifiers (or left empty if none): +Provide: +1. **Brief explanation** (1-3 sentences) of your selection reasoning +2. **Delimited sections** with newline-separated lists: *** Begin Context Identifiers - + *** End Context Identifiers *** Begin Modify Identifiers - + *** End Modify Identifiers -Do **not** include any additional commentary, formatting, or output outside these sections. +**No additional output** beyond these sections. --- -**Evaluation Criteria:** - -- You must identify all code identifiers directly referenced or implied in the user request, and correctly categorize them as context or modify identifiers. 
-- You must include any internal code elements that are clearly involved or required for the task. -- You must consider logical dependencies that may need to be modified together (e.g., helper modules, config files, related class methods). -- You must consider files that can be relevant as context to complete the user request, but only include their paths if code identifiers are not available or explicitly requested. -- You must return a clean and complete list of all relevant code identifiers and, if necessary, file paths, in the correct section. -- Do not over-include; be minimal but thorough. Return only what is truly required. - +**Quality Checklist:** +- ✓ Applied language-based rule correctly (identifiers for supported languages, paths for others) +- ✓ Categorized identifiers appropriately (context vs modify) +- ✓ Included necessary dependencies and context +- ✓ Verified all items exist in the repository tree +- ✓ Used proper dot notation for code identifiers +- ✓ Kept output minimal but complete """ ASSISTANT_SYSTEM_PROMPT = """ @@ -214,7 +245,10 @@ * Inside each file patch: * Use one or more @@ context headers to uniquely identify the code location - * Include exactly 3 lines of context above the change + * Include exactly 3 lines of context below the change as well + * The combination of context above + changed lines + context below must create a UNIQUE match in the file + * If the context pattern appears multiple times in the file, add more distinctive context lines until the location is unambiguous + * Context lines must form a contiguous block that exists nowhere else in the file with the same sequence * For insertions (where no lines are being removed), always provide the 3 lines of real, unaltered context above the insertion point, as they appear in the original file. This ensures the patch can be applied unambiguously and in the correct location. @@ -240,6 +274,10 @@ * Start with + * Contribute to achieve the user request according to the plain reasoning step you have previoulsy produced +* AMBIGUITY CHECK: Before finalizing any patch, verify that the context + change pattern appears exactly once in the target file + * If multiple matches are possible, expand the context window until the patch location is unique + * Context must be sufficient to unambiguously identify the exact insertion/modification point + --- **IMPORTS AND CLASS STRUCTURE RULES:** @@ -274,9 +312,10 @@ 1. Validate that every line you edit exists exactly as-is in the original context 2. Ensure one patch block per file, using multiple @@ hunks as needed 3. Include no formatting, layout, or interpretation changes -4. Ensure every @@ header is a valid, real, byte-identical line from the original file -5. Match the `MANDATORY PATCH FORMAT (V4A-Compatible)` structure expectations exactly -6. Ensure each patch line starts with a `@`, `+`, `-` or ` ` +4. Verify patch location uniqueness: ensure the context pattern (lines above + changed content + lines below) appears exactly once in the file +5. Ensure every @@ header is a valid, real, byte-identical line from the original file +6. Match the `MANDATORY PATCH FORMAT (V4A-Compatible)` structure expectations exactly +7. Ensure each patch line starts with a `@`, `+`, `-` or ` ` This is a surgical, precision editing mode. You must mirror source files exactly — no assumptions, no reformatting, no transformations. @@ -336,6 +375,28 @@ 10. **Succinctness of Format:** Strictly adhere to the step formatting with separators (`---`) and the beginning/end markers. 
Do not add extraneous numbering or narrative outside the prescribed structure. """ +CALMNESS_SYSTEM_PROMPT = """ +Remain calm and do not rush into execution if the user's request is ambiguous, lacks sufficient context, or is not explicit enough to proceed safely. + +If you do not have all the information you need, or if any part of the request is unclear, you must pause and explicitly request the necessary context or clarification from the user before taking any action. + +Never make assumptions or proceed with incomplete information. Your priority is to ensure that every action is based on clear, explicit, and sufficient instructions. +""" + +REPO_TREE_CONTEXT_PROMPT = """ +Here is a **tree representation of current state of the codebase** - you can refer to if needed: + +{REPO_TREE} + +""" + +README_CONTEXT_PROMPT = """ +Here is the README of the project for further context: + +{README} + +""" + CMD_TRIGGER_PLANNING_STEPS = """ You must operate in a multi-step planning and execution mode: first outline the plan step by step in a sequential way, then ask for my revision. Do not start implementing the steps without my approval. diff --git a/codetide/agents/tide/ui/public/elements/ReasoningMessage.jsx b/codetide/agents/tide/ui/public/elements/ReasoningMessage.jsx index 0dbcedb..3bba731 100644 --- a/codetide/agents/tide/ui/public/elements/ReasoningMessage.jsx +++ b/codetide/agents/tide/ui/public/elements/ReasoningMessage.jsx @@ -55,19 +55,26 @@ export default function ReasoningMessage() { return (
-        
+        
+          {
+            if (e.key === "Enter" || e.key === " ") {
+              e.preventDefault();
+              toggleExpanded();
+            }
+          }}
+        >
 
 {title}
 
-
+
 
 {summaryText}
 

@@ -75,14 +82,14 @@ export default function ReasoningMessage() {
-
+
 {isExpanded && (
-
)} diff --git a/codetide/autocomplete.py b/codetide/autocomplete.py index b06e118..80a13fc 100644 --- a/codetide/autocomplete.py +++ b/codetide/autocomplete.py @@ -1,5 +1,7 @@ from typing import List import difflib +import os + class AutoComplete: def __init__(self, word_list: List[str]) -> None: """Initialize with a list of strings to search from""" @@ -145,7 +147,6 @@ def validate_paths(self, file_paths): Raises: ValueError: If a path cannot be matched to a valid entry. """ - import os valid_paths = [] valid_set = set(self.words) for path in file_paths: diff --git a/codetide/core/models.py b/codetide/core/models.py index d1505aa..28362d6 100644 --- a/codetide/core/models.py +++ b/codetide/core/models.py @@ -539,7 +539,8 @@ def from_list_of_elements(cls, class CodeBase(BaseModel): """Root model representing complete codebase with file hierarchy and caching.""" root: List[CodeFileModel] = Field(default_factory=list) - _cached_elements :Dict[str, Union[CodeFileModel, ClassDefinition, FunctionDefinition, VariableDeclaration, ImportStatement]] = dict() + _cached_elements :Dict[str, Union[CodeFileModel, ClassDefinition, FunctionDefinition, VariableDeclaration, ImportStatement]] = dict() + _tree_dict :Optional[Dict[str, Any]] = None @property def cached_elements(self)->Dict[str, Union[CodeFileModel, ClassDefinition, FunctionDefinition, VariableDeclaration, ImportStatement]]: @@ -547,6 +548,12 @@ def cached_elements(self)->Dict[str, Union[CodeFileModel, ClassDefinition, Funct self._build_cached_elements() return self._cached_elements + @property + def tree_dict(self)->Dict[str, Any]: + if self._tree_dict is None: + self._build_tree_dict() + return self._tree_dict + def _build_cached_elements(self, force_update :bool=False): """Builds cache of all elements with unique IDs across entire codebase.""" @@ -630,23 +637,67 @@ def get_import(self, unique_id :str)->Optional[ImportStatement]: return match def get_tree_view(self, include_modules: bool = False, include_types: bool = False) -> str: - """Generates ASCII tree view of codebase structure with optional details.""" - - # Build the nested structure first - tree_dict = self._build_tree_dict() - + """Generates ASCII tree view of codebase structure with optional details""" # Convert to ASCII tree lines = [] - self._render_tree_node(tree_dict, "", True, lines, include_modules, include_types) + self._render_tree_node(self.tree_dict, "", True, lines, include_modules, include_types) return "\n".join(lines) - def _build_tree_dict(self) -> dict: - """Creates nested dictionary representing codebase directory structure.""" + def _build_tree_dict(self, filter_paths: list = None): + """Creates nested dictionary representing codebase directory structure with optional filtering.""" tree = {} - for code_file in self.root: + # If no filter paths provided, include all files (original behavior) + if filter_paths is None: + relevant_files = self.root + sibling_files = [] + else: + # Convert filter paths to normalized format for comparison + normalized_filter_paths = set() + filter_directories = set() + + for path in filter_paths: + normalized_path = path.replace("\\", "/") + normalized_filter_paths.add(normalized_path) + + # Extract directory path for this file + path_parts = normalized_path.split("/") + if len(path_parts) > 1: + dir_path = "/".join(path_parts[:-1]) + filter_directories.add(dir_path) + else: + # File is at root level + filter_directories.add("") + + # Find all files that are siblings (in the same directories as filtered files) + relevant_files = [] # 
Files that should show full content + sibling_files = [] # Files that should show as siblings only + + for code_file in self.root: + if not code_file.file_path: + continue + + normalized_file_path = code_file.file_path.replace("\\", "/") + + # Check if this is a filtered file (should show full content) + if normalized_file_path in normalized_filter_paths: + relevant_files.append(code_file) + continue + + # Check if this file is a sibling of any filtered file + file_parts = normalized_file_path.split("/") + if len(file_parts) > 1: + file_dir = "/".join(file_parts[:-1]) + else: + file_dir = "" + + if file_dir in filter_directories: + sibling_files.append(code_file) + + # Build tree structure from relevant files (with full content) + for code_file in relevant_files: if not code_file.file_path: continue @@ -657,19 +708,105 @@ def _build_tree_dict(self) -> dict: current_level = tree for i, part in enumerate(path_parts): if i == len(path_parts) - 1: # This is the file - current_level[part] = {"_type": "file", "_data": code_file} + current_level[part] = {"_type": "file", "_data": code_file, "_show_content": True} else: # This is a directory if part not in current_level: current_level[part] = {"_type": "directory"} current_level = current_level[part] + # Add sibling files (without full content) + for code_file in sibling_files: + if not code_file.file_path: + continue + + # Split the file path into parts + path_parts = code_file.file_path.replace("\\", "/").split("/") + + # Navigate/create the nested dictionary structure + current_level = tree + for i, part in enumerate(path_parts): + if i == len(path_parts) - 1: # This is the file + current_level[part] = {"_type": "file", "_data": code_file, "_show_content": True} + else: # This is a directory + if part not in current_level: + current_level[part] = {"_type": "directory"} + current_level = current_level[part] + + # Add placeholder for omitted content when filtering is applied + if filter_paths is not None: + tree = self._add_omitted_placeholders(tree, filter_paths) + + self._tree_dict = tree + + def _add_omitted_placeholders(self, tree: dict, filter_paths: list) -> dict: + """Adds '...' 
placeholders for directories that contain omitted files.""" + + # Get all unique directory paths from the full codebase + all_dirs = set() + for code_file in self.root: + if code_file.file_path: + path_parts = code_file.file_path.replace("\\", "/").split("/") + for i in range(len(path_parts) - 1): # Exclude the file itself + dir_path = "/".join(path_parts[:i+1]) + all_dirs.add(dir_path) + + # Get directories that should be shown (contain filtered files) + shown_dirs = set() + for filter_path in filter_paths: + path_parts = filter_path.replace("\\", "/").split("/") + for i in range(len(path_parts) - 1): + dir_path = "/".join(path_parts[:i+1]) + shown_dirs.add(dir_path) + + # Find directories that exist but aren't shown + omitted_dirs = all_dirs - shown_dirs + + # Add placeholders for omitted directories + def add_placeholders_recursive(current_tree: dict, current_path: str = ""): + # Check if any omitted directories should be represented at this level + for omitted_dir in omitted_dirs: + omitted_parts = omitted_dir.split("/") + current_parts = current_path.split("/") if current_path else [] + + # Check if this omitted directory is a direct child of current path + if (len(omitted_parts) == len(current_parts) + 1 and + omitted_dir.startswith(current_path) and + (not current_path or omitted_dir.startswith(current_path + "/"))): + + # Check if we don't already have this directory or a placeholder + dir_name = omitted_parts[-1] + has_content_in_dir = any(k for k in current_tree.keys() + if not k.startswith("_") and k != "...") + + if dir_name not in current_tree and has_content_in_dir: + current_tree["..."] = {"_type": "placeholder"} + break + + # Recursively process subdirectories + for key, value in current_tree.items(): + if not key.startswith("_") and key != "..." and value.get("_type") == "directory": + new_path = f"{current_path}/{key}" if current_path else key + add_placeholders_recursive(value, new_path) + + add_placeholders_recursive(tree) return tree def _render_tree_node(self, node: dict, prefix: str, is_last: bool, lines: list, include_modules: bool, include_types: bool, depth: int = 0): """Recursively renders tree node with ASCII art and optional type prefixes.""" items = [(k, v) for k, v in node.items() if not k.startswith("_")] - items.sort(key=lambda x: (x[1].get("_type", "directory") == "file", x[0])) + + # Sort items: directories first, then files, with "..." 
placeholders at the end + def sort_key(x): + name, data = x + if name == "...": + return (2, name) # Placeholders last + elif data.get("_type") == "file": + return (1, name) # Files second + else: + return (0, name) # Directories first + + items.sort(key=sort_key) for i, (name, data) in enumerate(items): is_last_item = i == len(items) - 1 @@ -682,20 +819,27 @@ def _render_tree_node(self, node: dict, prefix: str, is_last: bool, lines: list, current_prefix = "├── " next_prefix = prefix + "│ " + # Handle placeholder + if name == "...": + lines.append(f"{prefix}{current_prefix}...") + continue + # Determine display name with optional type prefix display_name = name if include_types: if data.get("_type") == "file": - display_name = f" {name}" + display_name = f"📄 {name}" else: - display_name = f"{name}" + display_name = f"📁 {name}" lines.append(f"{prefix}{current_prefix}{display_name}") # Handle file contents if requested if data.get("_type") == "file" and include_modules: - code_file = data["_data"] - self._render_file_contents(code_file, next_prefix, lines, include_types) + # Only show content for files that should show full content + if data.get("_show_content", True): # Default True for backward compatibility + code_file = data["_data"] + self._render_file_contents(code_file, next_prefix, lines, include_types) elif data.get("_type") != "file": # This is a directory - recursively render its contents self._render_tree_node(data, next_prefix, is_last_item, lines, @@ -773,6 +917,99 @@ def _render_class_contents(self, class_def: 'ClassDefinition', prefix: str, lines.append(f"{prefix}{current_prefix}{name}") + def compile_tree_nodes_dict(self) -> dict: + """ + Compiles a dictionary where each entry is a node of the repo tree up to file level. + Keys are paths, values are lists of direct children identifiers. 
+ + For directories: contains subdirectory names and file names + For files: contains element names (variables, functions, classes) + + Returns: + dict: {path: [list_of_direct_children_identifiers]} + """ + + # First build the tree structure + tree = self.tree_dict + + # Dictionary to store the flattened node structure + nodes_dict = {} + + def traverse_tree(node_dict: dict, current_path: str = ""): + """Recursively traverse the tree and collect node information.""" + + # Get all items that aren't metadata (don't start with _) + items = [(k, v) for k, v in node_dict.items() if not k.startswith("_")] + + # Collect direct children for current node + children = [] + + for name, data in items: + # Skip placeholder entries + if name == "...": + children.append("...") + continue + + if data.get("_type") == "file": + # For files, add the filename to current directory's children + children.append(name) + + # Create entry for the file itself with its elements + file_path = f"{current_path}/{name}" if current_path else name + file_elements = [] + + # Only process file contents if we should show content + if data.get("_show_content", True): + code_file = data["_data"] + + # Add variables + for variable in code_file.variables: + file_elements.append(variable.name) + + # Add functions + for function in code_file.functions: + file_elements.append(function.name) + + # Add classes + for class_def in code_file.classes: + file_elements.append(class_def.name) + + # Create entry for each class with its members + class_path = f"{file_path}::{class_def.name}" + class_members = [] + + # Add class attributes + for attribute in class_def.attributes: + class_members.append(attribute.name) + + # Add class methods + for method in class_def.methods: + class_members.append(method.name) + + nodes_dict[class_path] = class_members + + nodes_dict[file_path] = file_elements + + elif data.get("_type") == "directory": + # For directories, add the directory name to current node's children + children.append(name) + + # Recursively process the directory + dir_path = f"{current_path}/{name}" if current_path else name + traverse_tree(data, dir_path) + + elif data.get("_type") == "placeholder": + # Handle placeholder + children.append("...") + + # Store children for current path + nodes_dict[current_path if current_path else "."] = children + + # Start traversal from root + traverse_tree(tree) + + return nodes_dict + def get(self, unique_id :Union[str, List[str]], degree :int=1, diff --git a/codetide/mcp/tools/patch_code/__init__.py b/codetide/mcp/tools/patch_code/__init__.py index fe47591..f7e66a6 100644 --- a/codetide/mcp/tools/patch_code/__init__.py +++ b/codetide/mcp/tools/patch_code/__init__.py @@ -207,9 +207,7 @@ def process_patch( # Normalize line endings before processing patches_text = open_fn(patch_path) - print(f"{patches_text=}") patches = parse_patch_blocks(patches_text)#or [""] - print(f"{patches=}") all_paths_needed = [] for text in patches: diff --git a/codetide/parsers/__init__.py b/codetide/parsers/__init__.py index 9c5b125..eae9617 100644 --- a/codetide/parsers/__init__.py +++ b/codetide/parsers/__init__.py @@ -8,4 +8,9 @@ "PythonParser", "TypescriptParser", "BaseParser" +] + +SUPPORTED_LANGUAGES = [ + "Python", + "Typescript", ] \ No newline at end of file diff --git a/codetide/search/code_search.py b/codetide/search/code_search.py new file mode 100644 index 0000000..1cb1b73 --- /dev/null +++ b/codetide/search/code_search.py @@ -0,0 +1,358 @@ +from codetide.search.preprocessor import CodeQueryPreprocessor +from 
codetide.search.engine import AsyncFastCodeSearchIndex + +from typing import Dict, List, Tuple, Optional +from collections import defaultdict +import asyncio + +class SmartCodeSearch: + """ + High-level interface for intelligent code search with preprocessing. + """ + + def __init__(self, + documents: Optional[Dict[str, str]] = None, + index_path: Optional[str] = None, + max_workers: Optional[int] = None, + preprocess_documents: bool = False): + """ + Initialize the smart code search. + + Args: + documents: Dictionary of documents to index + index_path: Path to load existing index from + max_workers: Number of workers for parallel processing + preprocess_documents: Whether to preprocess documents during indexing + """ + self.preprocessor = CodeQueryPreprocessor() + self.max_workers = max_workers + self.preprocess_documents = preprocess_documents + + # Initialize the search index + if index_path: + self.search_index = None # Will be loaded async + self.index_path = index_path + elif documents: + processed_docs = documents + if preprocess_documents: + processed_docs = { + key: self._preprocess_document_content(content) + for key, content in documents.items() + } + + self.search_index = AsyncFastCodeSearchIndex(processed_docs, max_workers) + self.index_path = None + else: + raise ValueError("Must provide either documents or index_path") + + self.ready = False + + def _preprocess_document_content(self, content: str) -> str: + """Preprocess document content for better indexing""" + # For documents, we want to preserve structure but add searchable variations + lines = content.split('\n') + processed_lines = [] + + for line in lines: + processed_lines.append(line) # Keep original + + # Add preprocessed version for better matching + preprocessed = self.preprocessor.preprocess_query( + line, + expand_case=True, + expand_abbreviations=False, # Don't expand in documents + apply_stemming=False, + remove_stop_words=False + ) + + if preprocessed and preprocessed != line.lower(): + processed_lines.append(f" {preprocessed}") # Add as searchable content + + return '\n'.join(processed_lines) + + async def initialize_async(self): + """Initialize the search index asynchronously""" + if self.index_path: + self.search_index = await AsyncFastCodeSearchIndex.load_index_async( + self.index_path, max_workers=self.max_workers + ) + else: + await self.search_index.build_index_async() + + self.ready = True + + async def search_smart(self, + query: str, + top_k: int = 10, + use_variations: bool = True, + exact_match_boost: float = 0.3) -> List[Tuple[str, float]]: + """ + Perform intelligent search with query preprocessing and multiple strategies. + + Args: + query: Raw user query + top_k: Number of top results to return + use_variations: Whether to use multiple query variations + exact_match_boost: Boost factor for exact matches + + Returns: + List of (document_key, score) tuples + """ + if not self.ready: + raise RuntimeError("Search index not ready. 
Call initialize_async() first.") + + if not query or not query.strip(): + return [] + + # Generate query variations + if use_variations: + queries = self.preprocessor.generate_query_variations(query.strip()) + else: + queries = [self.preprocessor.preprocess_query(query.strip())] + + # Remove empty queries + queries = [q for q in queries if q and q.strip()] + + if not queries: + return [] + + # Run multiple searches concurrently + search_tasks = [] + + # Regular searches with different query variations + for q in queries: + search_tasks.append(self.search_index.search_async(q, top_k * 2)) + + # Exact match search for the original query + if exact_match_boost > 0: + search_tasks.append( + self.search_index.search_exact_match_async(query.strip(), top_k) + ) + + # Execute all searches concurrently + all_results = await asyncio.gather(*search_tasks) + + # Combine and score results + combined_scores = defaultdict(float) + result_counts = defaultdict(int) + + # Weight different query variations + for i, results in enumerate(all_results[:-1] if exact_match_boost > 0 else all_results): + weight = 1.0 / (i + 1) # First query gets highest weight + for doc_key, score in results: + combined_scores[doc_key] += score * weight + result_counts[doc_key] += 1 + + # Add exact match boost + if exact_match_boost > 0 and len(all_results) > len(queries): + exact_results = all_results[-1] + for doc_key, score in exact_results: + combined_scores[doc_key] += score * exact_match_boost + result_counts[doc_key] += 1 + + # Normalize scores by appearance frequency and sort + final_scores = [ + (doc_key, score / result_counts[doc_key]) + for doc_key, score in combined_scores.items() + ] + + return sorted(final_scores, key=lambda x: x[1], reverse=True)[:top_k] + + async def search_with_context(self, + query: str, + top_k: int = 10, + context_lines: int = 2) -> List[Dict]: + """ + Search with context lines around matches. 
+ + Returns: + List of dictionaries with doc_key, score, and context + """ + results = await self.search_smart(query, top_k) + + enriched_results = [] + for doc_key, score in results: + if doc_key in self.search_index.documents: + content = self.search_index.documents[doc_key] + # Simple context extraction (could be enhanced) + lines = content.split('\n') + context = lines[:min(context_lines * 2, len(lines))] + + enriched_results.append({ + 'doc_key': doc_key, + 'score': score, + 'context': '\n'.join(context), + 'total_lines': len(lines) + }) + + return enriched_results + + async def update_document(self, doc_key: str, content: str): + """Update a document with preprocessing""" + processed_content = content + if self.preprocess_documents: + processed_content = self._preprocess_document_content(content) + + await self.search_index.update_document_async(doc_key, processed_content) + + async def batch_update_documents(self, updates: Dict[str, str]): + """Update multiple documents with preprocessing""" + processed_updates = updates + if self.preprocess_documents: + processed_updates = { + key: self._preprocess_document_content(content) + for key, content in updates.items() + } + + await self.search_index.batch_update_documents_async(processed_updates) + + async def save_index(self, filepath: str): + """Save the search index""" + await self.search_index.save_index_async(filepath) + + def get_stats(self) -> Dict: + """Get search index statistics""" + stats = self.search_index.get_stats() if self.search_index else {} + stats['preprocessor_cache_size'] = { + 'camel_case': self.preprocessor._expand_camel_case.cache_info().currsize, + 'snake_kebab': self.preprocessor._expand_snake_kebab.cache_info().currsize, + 'stemming': self.preprocessor._simple_stem.cache_info().currsize + } + return stats + + +# Synchronous wrapper +class SmartCodeSearchSync: + """Synchronous wrapper for SmartCodeSearch""" + + def __init__(self, + documents: Optional[Dict[str, str]] = None, + index_path: Optional[str] = None, + max_workers: Optional[int] = None, + preprocess_documents: bool = False): + + self.async_search = SmartCodeSearch( + documents, index_path, max_workers, preprocess_documents + ) + + # Initialize synchronously + asyncio.run(self.async_search.initialize_async()) + + def search(self, query: str, top_k: int = 10, use_variations: bool = True) -> List[Tuple[str, float]]: + """Synchronous smart search""" + return asyncio.run(self.async_search.search_smart(query, top_k, use_variations)) + + def search_with_context(self, query: str, top_k: int = 10, context_lines: int = 2) -> List[Dict]: + """Synchronous search with context""" + return asyncio.run(self.async_search.search_with_context(query, top_k, context_lines)) + + def update_document(self, doc_key: str, content: str): + """Synchronous document update""" + asyncio.run(self.async_search.update_document(doc_key, content)) + + def save_index(self, filepath: str): + """Synchronous index save""" + asyncio.run(self.async_search.save_index(filepath)) + + def get_stats(self) -> Dict: + """Get statistics""" + return self.async_search.get_stats() + + +# Example usage and testing +async def demo_smart_search(): + """Demonstrate the smart code search functionality""" + + # Sample code documents + documents = { + "user_manager.py": """ + class UserManager: + def __init__(self): + self.users = [] + + def getUserByEmail(self, email): + return self.find_user_by_email(email) + + def find_user_by_email(self, email_address): + for user in self.users: + if user.email == 
email_address: + return user + return None + """, + + "api_controller.js": """ + const APIController = { + async handleUserRequest(req, res) { + const userData = await this.processUserData(req.body); + res.json(userData); + }, + + processUserData: function(data) { + return validateUserInput(data); + } + }; + """, + + "database_config.py": """ + DB_CONFIG = { + 'host': 'localhost', + 'port': 5432, + 'database': 'myapp', + 'user': 'admin', + 'password': 'secret' + } + + class DatabaseManager: + def __init__(self, config): + self.cfg = config + self.connection = None + """, + + "utils/string_helpers.py": """ + def camelCaseToSnake(input_string): + return re.sub('([A-Z])', r'_\1', input_string).lower() + + def snake_case_to_camel(snake_str): + components = snake_str.split('_') + return components[0] + ''.join(x.title() for x in components[1:]) + """ + } + + # Test the smart search + print("\n=== SMART CODE SEARCH DEMO ===") + search = SmartCodeSearch(documents) + await search.initialize_async() + + search_queries = [ + "getUserByEmail", + "find user email", + "API controller", + "db config", + "camel snake conversion", + "Hi lets update the DataBaseManager!" + ] + + for query in search_queries: + print(f"\n--- Searching for: '{query}' ---") + results = await search.search_smart(query, top_k=3) + + for doc_key, score in results: + print(f" {score:.3f}: {doc_key}") + + # Test with context + print("\n=== SEARCH WITH CONTEXT ===") + context_results = await search.search_with_context("user email", top_k=2, context_lines=3) + + for result in context_results: + print(f"\n{result['doc_key']} (score: {result['score']:.3f}):") + print(result['context'][:200] + "..." if len(result['context']) > 200 else result['context']) + + # Show stats + print("\n=== STATS ===") + stats = search.get_stats() + for key, value in stats.items(): + print(f"{key}: {value}") + +if __name__ == "__main__": + + asyncio.run(demo_smart_search()) \ No newline at end of file diff --git a/codetide/search/engine.py b/codetide/search/engine.py new file mode 100644 index 0000000..1ff1010 --- /dev/null +++ b/codetide/search/engine.py @@ -0,0 +1,767 @@ +import pickle +import asyncio +from collections import defaultdict, Counter +import math +import re +from typing import Dict, List, Tuple, Set, Optional +from concurrent.futures import ThreadPoolExecutor +from codetide.core.logs import logger + +class AsyncFastCodeSearchIndex: + def __init__(self, documents: Dict[str, str], max_workers: Optional[int] = None): + """ + documents: {key: content} where key is your filepath/identifier + max_workers: Number of workers for parallel processing (defaults to CPU count) + """ + self.documents = documents + self.doc_keys = list(documents.keys()) + self.N = len(documents) + self.max_workers = max_workers + self.executor = ThreadPoolExecutor(max_workers=max_workers) + + # Will be set during index building + self.index_ready = False + + async def build_index_async(self): + """Build index asynchronously with parallel processing""" + logger.info(f"Building search index async with {self.max_workers or 'default'} workers...") + + # Split documents into chunks for parallel processing + chunk_size = max(1, len(self.documents) // (self.max_workers or 4)) + doc_items = list(self.documents.items()) + chunks = [doc_items[i:i + chunk_size] for i in range(0, len(doc_items), chunk_size)] + + # Process chunks in parallel + loop = asyncio.get_event_loop() + + # Tokenize and count terms in parallel + tokenize_tasks = [ + loop.run_in_executor(self.executor, 
self._process_chunk, chunk) + for chunk in chunks + ] + + chunk_results = await asyncio.gather(*tokenize_tasks) + + # Merge results from all chunks + await self._merge_chunk_results(chunk_results) + + self.index_ready = True + logger.info(f"Async index built for {self.N} documents with {len(self.idf_scores)} unique terms") + + def _process_chunk(self, chunk: List[Tuple[str, str]]) -> Dict: + """Process a chunk of documents (runs in thread)""" + if not hasattr(self, '_token_pattern'): + self._token_pattern = re.compile(r'\b\w+\b') + + chunk_data = { + 'tokenized_docs': {}, + 'doc_lengths': {}, + 'doc_term_counts': {}, + 'term_doc_freq': defaultdict(int), + 'all_terms': set() + } + + for doc_key, content in chunk: + tokens = self._token_pattern.findall(content.lower()) + term_counts = Counter(tokens) + + chunk_data['tokenized_docs'][doc_key] = tokens + chunk_data['doc_lengths'][doc_key] = len(tokens) + chunk_data['doc_term_counts'][doc_key] = term_counts + + unique_terms = term_counts.keys() + chunk_data['all_terms'].update(unique_terms) + for term in unique_terms: + chunk_data['term_doc_freq'][term] += 1 + + return chunk_data + + async def _merge_chunk_results(self, chunk_results: List[Dict]): + """Merge results from parallel chunk processing""" + # Initialize combined data structures + self.tokenized_docs = {} + self.doc_lengths = {} + self.doc_term_counts = {} + term_doc_freq = defaultdict(int) + all_terms = set() + + # Merge all chunks + for chunk_data in chunk_results: + self.tokenized_docs.update(chunk_data['tokenized_docs']) + self.doc_lengths.update(chunk_data['doc_lengths']) + self.doc_term_counts.update(chunk_data['doc_term_counts']) + all_terms.update(chunk_data['all_terms']) + + for term, freq in chunk_data['term_doc_freq'].items(): + term_doc_freq[term] += freq + + # Compute IDF scores + loop = asyncio.get_event_loop() + self.idf_scores = await loop.run_in_executor( + self.executor, + self._compute_idf_scores, + all_terms, + term_doc_freq + ) + + # Compute TF scores in parallel + tf_tasks = [] + chunk_size = max(1, len(self.doc_keys) // (self.max_workers or 4)) + for i in range(0, len(self.doc_keys), chunk_size): + doc_chunk = self.doc_keys[i:i + chunk_size] + tf_tasks.append( + loop.run_in_executor( + self.executor, + self._compute_tf_scores_chunk, + doc_chunk + ) + ) + + tf_results = await asyncio.gather(*tf_tasks) + + # Merge TF scores + self.tf_scores = {} + for tf_chunk in tf_results: + self.tf_scores.update(tf_chunk) + + # BM25 parameters + self.k1 = 1.5 + self.b = 0.75 + self.avg_doc_length = sum(self.doc_lengths.values()) / self.N + + # Build inverted index + self.inverted_index = await loop.run_in_executor( + self.executor, + self._build_inverted_index + ) + + def _compute_idf_scores(self, all_terms: Set[str], term_doc_freq: Dict[str, int]) -> Dict[str, float]: + """Compute IDF scores (runs in thread)""" + return { + term: math.log(self.N / freq) + for term, freq in term_doc_freq.items() + } + + def _compute_tf_scores_chunk(self, doc_keys: List[str]) -> Dict[str, Dict[str, float]]: + """Compute TF scores for a chunk of documents""" + tf_scores = {} + for doc_key in doc_keys: + term_counts = self.doc_term_counts[doc_key] + doc_length = self.doc_lengths[doc_key] + tf_scores[doc_key] = { + term: count / doc_length + for term, count in term_counts.items() + } + return tf_scores + + def _build_inverted_index(self) -> defaultdict: + """Build inverted index (runs in thread)""" + inverted_index = defaultdict(set) + for doc_key, tokens in self.tokenized_docs.items(): + for term 
in set(tokens): + inverted_index[term].add(doc_key) + return inverted_index + + def _process_single_document(self, doc_key: str, content: str) -> Dict: + """Process a single document for updating (runs in thread)""" + if not hasattr(self, '_token_pattern'): + self._token_pattern = re.compile(r'\b\w+\b') + + tokens = self._token_pattern.findall(content.lower()) + term_counts = Counter(tokens) + doc_length = len(tokens) + + return { + 'doc_key': doc_key, + 'tokens': tokens, + 'term_counts': term_counts, + 'doc_length': doc_length, + 'unique_terms': set(term_counts.keys()) + } + + async def _remove_document_from_index(self, doc_key: str): + """Remove document data from all indexes""" + if doc_key not in self.doc_keys: + return + + # Get old document terms for cleanup + old_terms = set(self.doc_term_counts.get(doc_key, {}).keys()) + + # Remove from inverted index + for term in old_terms: + if term in self.inverted_index: + self.inverted_index[term].discard(doc_key) + if not self.inverted_index[term]: # Remove empty sets + del self.inverted_index[term] + + # Remove from all document-specific indexes + self.tokenized_docs.pop(doc_key, None) + self.doc_lengths.pop(doc_key, None) + self.doc_term_counts.pop(doc_key, None) + self.tf_scores.pop(doc_key, None) + + # Remove from document keys and update count + if doc_key in self.doc_keys: + self.doc_keys.remove(doc_key) + self.N -= 1 + + # Recalculate average document length + if self.N > 0: + self.avg_doc_length = sum(self.doc_lengths.values()) / self.N + else: + self.avg_doc_length = 0 + + async def _integrate_document_data(self, doc_data: Dict, is_update: bool): + """Integrate new document data into indexes""" + doc_key = doc_data['doc_key'] + tokens = doc_data['tokens'] + term_counts = doc_data['term_counts'] + doc_length = doc_data['doc_length'] + unique_terms = doc_data['unique_terms'] + + # Update document-specific data + self.tokenized_docs[doc_key] = tokens + self.doc_lengths[doc_key] = doc_length + self.doc_term_counts[doc_key] = term_counts + self.tf_scores[doc_key] = { + term: count / doc_length for term, count in term_counts.items() + } + + # Update inverted index + for term in unique_terms: + self.inverted_index[term].add(doc_key) + + # Update IDF scores for new terms + for term in unique_terms: + if term not in self.idf_scores: + # Count how many documents contain this term + doc_freq = len(self.inverted_index[term]) + self.idf_scores[term] = math.log(self.N / doc_freq) + + # Recalculate average document length + self.avg_doc_length = sum(self.doc_lengths.values()) / self.N + + # For efficiency, we could recalculate IDF scores for all terms periodically + # rather than on every update, but this keeps things simple and correct + + async def update_document_async(self, doc_key: str, new_content: str): + """ + Update or add a single document to the index efficiently + """ + if not self.index_ready: + raise RuntimeError("Index not ready. 
Call build_index_async() first.") + + logger.info(f"Updating document: {doc_key}") + + # Check if this is an update or insert + is_update = doc_key in self.doc_keys + + # Remove old document data if updating + if is_update: + await self._remove_document_from_index(doc_key) + else: + # Add to document list for new documents + self.doc_keys.append(doc_key) + self.N += 1 + + # Update documents dict + self.documents[doc_key] = new_content + + # Process new content + loop = asyncio.get_event_loop() + doc_data = await loop.run_in_executor( + self.executor, + self._process_single_document, + doc_key, + new_content + ) + + # Update indexes with new data + await self._integrate_document_data(doc_data, is_update) + + logger.info(f"Document {'updated' if is_update else 'added'}: {doc_key}") + + async def batch_update_documents_async(self, updates: Dict[str, str]): + """Update multiple documents concurrently""" + if not self.index_ready: + raise RuntimeError("Index not ready. Call build_index_async() first.") + + logger.info(f"Batch updating {len(updates)} documents...") + + # Process all updates concurrently + update_tasks = [ + self.update_document_async(doc_key, content) + for doc_key, content in updates.items() + ] + + await asyncio.gather(*update_tasks) + logger.info(f"Batch update completed for {len(updates)} documents") + + async def remove_document_async(self, doc_key: str): + """Remove a document from the index""" + if not self.index_ready: + raise RuntimeError("Index not ready. Call build_index_async() first.") + + if doc_key not in self.doc_keys: + logger.warning(f"Document {doc_key} not found in index") + return + + logger.info(f"Removing document: {doc_key}") + + # Remove from documents dict + self.documents.pop(doc_key, None) + + # Remove from indexes + await self._remove_document_from_index(doc_key) + + logger.info(f"Document removed: {doc_key}") + + async def incremental_rebuild_async(self, similarity_threshold: float = 0.8): + """Smart incremental rebuild of the index""" + if not self.index_ready: + raise RuntimeError("Index not ready. Call build_index_async() first.") + + logger.info("Starting incremental rebuild...") + + # For now, we'll do a simple approach: recalculate IDF scores for all terms + # In a more sophisticated implementation, we could track term frequency changes + # and only recalculate when changes exceed the similarity threshold + + loop = asyncio.get_event_loop() + + # Recalculate term document frequencies + term_doc_freq = defaultdict(int) + for term, doc_set in self.inverted_index.items(): + term_doc_freq[term] = len(doc_set) + + # Recalculate IDF scores + all_terms = set(self.idf_scores.keys()) + self.idf_scores = await loop.run_in_executor( + self.executor, + self._compute_idf_scores, + all_terms, + term_doc_freq + ) + + logger.info("Incremental rebuild completed") + + async def get_document_stats(self, doc_key: str) -> Dict: + """Get statistics for a specific document""" + if not self.index_ready: + raise RuntimeError("Index not ready. 
Call build_index_async() first.") + + if doc_key not in self.doc_keys: + return {'error': f'Document {doc_key} not found'} + + term_counts = self.doc_term_counts[doc_key] + doc_length = self.doc_lengths[doc_key] + + return { + 'document_key': doc_key, + 'document_length': doc_length, + 'unique_terms': len(term_counts), + 'most_frequent_terms': term_counts.most_common(10), + 'tf_idf_top_terms': sorted([ + (term, self.tf_scores[doc_key][term] * self.idf_scores.get(term, 0)) + for term in term_counts.keys() + ], key=lambda x: x[1], reverse=True)[:10] + } + + async def search_async(self, query: str, top_k: int = 10) -> List[Tuple[str, float]]: + """ + Async search with concurrent scoring of candidate documents + """ + if not self.index_ready: + raise RuntimeError("Index not ready. Call build_index_async() first.") + + # Tokenize query + if not hasattr(self, '_token_pattern'): + self._token_pattern = re.compile(r'\b\w+\b') + + query_terms = self._token_pattern.findall(query.lower()) + if not query_terms: + return [] + + # Get candidate documents using inverted index + candidate_docs = set() + query_term_counts = Counter(query_terms) + + for term in query_term_counts: + if term in self.inverted_index: + candidate_docs.update(self.inverted_index[term]) + + if not candidate_docs: + return [] + + # Score candidates in parallel if we have many + if len(candidate_docs) > 20: # Only parallelize if worth it + return await self._score_candidates_parallel(candidate_docs, query_term_counts, top_k) + else: + return await self._score_candidates_sequential(candidate_docs, query_term_counts, top_k) + + async def _score_candidates_parallel(self, candidate_docs: Set[str], query_term_counts: Counter, top_k: int) -> List[Tuple[str, float]]: + """Score candidates in parallel""" + loop = asyncio.get_event_loop() + + # Split candidates into chunks + candidates_list = list(candidate_docs) + chunk_size = max(1, len(candidates_list) // (self.max_workers or 4)) + chunks = [candidates_list[i:i + chunk_size] for i in range(0, len(candidates_list), chunk_size)] + + # Score each chunk in parallel + scoring_tasks = [ + loop.run_in_executor( + self.executor, + self._score_chunk, + chunk, + query_term_counts + ) + for chunk in chunks + ] + + chunk_scores = await asyncio.gather(*scoring_tasks) + + # Merge all scores + all_scores = {} + for scores in chunk_scores: + all_scores.update(scores) + + # Return top-k results + if len(all_scores) > top_k * 3: + import heapq + return heapq.nlargest(top_k, all_scores.items(), key=lambda x: x[1]) + else: + return sorted(all_scores.items(), key=lambda x: x[1], reverse=True)[:top_k] + + async def _score_candidates_sequential(self, candidate_docs: Set[str], query_term_counts: Counter, top_k: int) -> List[Tuple[str, float]]: + """Score candidates sequentially for small sets""" + loop = asyncio.get_event_loop() + scores = await loop.run_in_executor( + self.executor, + self._score_chunk, + list(candidate_docs), + query_term_counts + ) + + return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_k] + + def _score_chunk(self, doc_chunk: List[str], query_term_counts: Counter) -> Dict[str, float]: + """Score a chunk of documents (runs in thread)""" + scores = {} + + for doc_key in doc_chunk: + doc_length = self.doc_lengths[doc_key] + doc_term_counts = self.doc_term_counts[doc_key] + + bm25_score = 0.0 + tfidf_score = 0.0 + + for term, query_count in query_term_counts.items(): + idf = self.idf_scores.get(term, 0) + if idf == 0: + continue + + tf = doc_term_counts.get(term, 0) + if tf == 
0: + continue + + # BM25 calculation + numerator = tf * (self.k1 + 1) + denominator = tf + self.k1 * (1 - self.b + self.b * (doc_length / self.avg_doc_length)) + bm25_score += idf * (numerator / denominator) + + # TF-IDF calculation + tf_normalized = self.tf_scores[doc_key].get(term, 0) + tfidf_score += tf_normalized * idf * query_count + + # Combine scores + combined_score = 0.7 * bm25_score + 0.3 * tfidf_score + scores[doc_key] = combined_score + + return scores + + async def batch_search_async(self, queries: List[str], top_k: int = 10) -> List[List[Tuple[str, float]]]: + """ + Search multiple queries concurrently + """ + search_tasks = [self.search_async(query, top_k) for query in queries] + return await asyncio.gather(*search_tasks) + + async def search_exact_match_async(self, query: str, top_k: int = 10) -> List[Tuple[str, float]]: + """ + Async exact substring matching + """ + loop = asyncio.get_event_loop() + + # Split documents for parallel processing + doc_items = list(self.documents.items()) + chunk_size = max(1, len(doc_items) // (self.max_workers or 4)) + chunks = [doc_items[i:i + chunk_size] for i in range(0, len(doc_items), chunk_size)] + + # Process chunks in parallel + match_tasks = [ + loop.run_in_executor( + self.executor, + self._exact_match_chunk, + chunk, + query.lower() + ) + for chunk in chunks + ] + + chunk_matches = await asyncio.gather(*match_tasks) + + # Merge results + all_matches = [] + for matches in chunk_matches: + all_matches.extend(matches) + + return sorted(all_matches, key=lambda x: x[1], reverse=True)[:top_k] + + def _exact_match_chunk(self, doc_chunk: List[Tuple[str, str]], query_lower: str) -> List[Tuple[str, float]]: + """Process exact matching for a chunk of documents""" + matches = [] + for doc_key, content in doc_chunk: + content_lower = content.lower() + if query_lower in content_lower: + count = content_lower.count(query_lower) + score = count / (len(content) + 1) + matches.append((doc_key, score)) + return matches + + async def save_index_async(self, filepath: str): + """Save pre-computed index to disk asynchronously""" + if not self.index_ready: + raise RuntimeError("Index not ready. 
Call build_index_async() first.") + + logger.info(f"Saving search index to {filepath}") + + loop = asyncio.get_event_loop() + await loop.run_in_executor( + self.executor, + self._save_index_sync, + filepath + ) + + logger.info("Index saved successfully") + + def _save_index_sync(self, filepath: str): + """Synchronous save operation (runs in thread)""" + with open(filepath, 'wb') as f: + pickle.dump({ + 'tokenized_docs': self.tokenized_docs, + 'doc_lengths': self.doc_lengths, + 'doc_term_counts': self.doc_term_counts, + 'idf_scores': self.idf_scores, + 'tf_scores': self.tf_scores, + 'doc_keys': self.doc_keys, + 'N': self.N, + 'avg_doc_length': self.avg_doc_length, + 'inverted_index': dict(self.inverted_index) + }, f) + + @classmethod + async def load_index_async(cls, filepath: str, documents: Dict[str, str] = None, max_workers: Optional[int] = None): + """Load pre-computed index from disk asynchronously""" + logger.info(f"Loading search index from {filepath}") + + instance = cls.__new__(cls) + instance.max_workers = max_workers + instance.executor = ThreadPoolExecutor(max_workers=max_workers) + + loop = asyncio.get_event_loop() + data = await loop.run_in_executor( + instance.executor, + instance._load_index_sync, + filepath + ) + + # Restore all pre-computed data + instance.tokenized_docs = data['tokenized_docs'] + instance.doc_lengths = data['doc_lengths'] + instance.doc_term_counts = data['doc_term_counts'] + instance.idf_scores = data['idf_scores'] + instance.tf_scores = data['tf_scores'] + instance.doc_keys = data['doc_keys'] + instance.N = data['N'] + instance.avg_doc_length = data['avg_doc_length'] + instance.inverted_index = defaultdict(set, data['inverted_index']) + + # BM25 parameters + instance.k1 = 1.5 + instance.b = 0.75 + + instance.documents = documents or {} + instance.index_ready = True + + logger.info(f"Index loaded: {instance.N} documents with {len(instance.idf_scores)} unique terms") + return instance + + def _load_index_sync(self, filepath: str) -> Dict: + """Synchronous load operation (runs in thread)""" + with open(filepath, 'rb') as f: + return pickle.load(f) + + def get_stats(self) -> Dict: + """Get index statistics""" + if not self.index_ready: + return {'status': 'Index not ready'} + + return { + 'total_documents': self.N, + 'total_unique_terms': len(self.idf_scores), + 'average_document_length': self.avg_doc_length, + 'max_workers': self.max_workers, + 'status': 'ready' + } + + def __del__(self): + """Clean up executor""" + if hasattr(self, 'executor'): + self.executor.shutdown(wait=False) + +# Convenience wrapper for synchronous usage +class FastCodeSearchIndex: + """Synchronous wrapper around AsyncFastCodeSearchIndex""" + + def __init__(self, documents: Dict[str, str], max_workers: Optional[int] = None): + self.async_index = AsyncFastCodeSearchIndex(documents, max_workers) + # Build index synchronously + asyncio.run(self.async_index.build_index_async()) + + def search(self, query: str, top_k: int = 10) -> List[Tuple[str, float]]: + return asyncio.run(self.async_index.search_async(query, top_k)) + + def batch_search(self, queries: List[str], top_k: int = 10) -> List[List[Tuple[str, float]]]: + return asyncio.run(self.async_index.batch_search_async(queries, top_k)) + + def search_exact_match(self, query: str, top_k: int = 10) -> List[Tuple[str, float]]: + return asyncio.run(self.async_index.search_exact_match_async(query, top_k)) + + def update_document(self, doc_key: str, new_content: str): + """Update or add a single document""" + return 
asyncio.run(self.async_index.update_document_async(doc_key, new_content)) + + def batch_update_documents(self, updates: Dict[str, str]): + """Update multiple documents concurrently""" + return asyncio.run(self.async_index.batch_update_documents_async(updates)) + + def remove_document(self, doc_key: str): + """Remove a document from the index""" + return asyncio.run(self.async_index.remove_document_async(doc_key)) + + def incremental_rebuild(self, similarity_threshold: float = 0.8): + """Smart incremental rebuild""" + return asyncio.run(self.async_index.incremental_rebuild_async(similarity_threshold)) + + def get_document_stats(self, doc_key: str) -> Dict: + """Get statistics for a specific document""" + return asyncio.run(self.async_index.get_document_stats(doc_key)) + + def save_index(self, filepath: str): + asyncio.run(self.async_index.save_index_async(filepath)) + + @classmethod + def load_index(cls, filepath: str, documents: Dict[str, str] = None, max_workers: Optional[int] = None): + instance = cls.__new__(cls) + instance.async_index = asyncio.run( + AsyncFastCodeSearchIndex.load_index_async(filepath, documents, max_workers) + ) + return instance + + def get_stats(self) -> Dict: + return self.async_index.get_stats() + +async def main(): + """Example usage with updates""" + documents = { + "examples.apply_patch.trim_to_patch_section": "function that trims patch sections and handles patch file modifications", + "codetide.parsers.python_parser.PythonParser": "class for parsing python files and extracting code structure", + "codetide.agents.tide.defaults.DEFAULT_AGENT_TIDE_LLM_CONFIG_PATH": "default configuration path for tide agent llm settings", + "examples.aicore_dashboard.od": "object detection dashboard example implementation" + } + + # Build initial index + logger.info("=== BUILDING INITIAL INDEX ===") + search_index = AsyncFastCodeSearchIndex(documents, max_workers=4) + await search_index.build_index_async() + + # Test initial search + logger.info("Initial search test:") + results = await search_index.search_async("python parser", top_k=2) + for doc_key, score in results: + logger.info(f" {score:.3f}: {doc_key}") + + # Update existing document + logger.info("\n=== UPDATING EXISTING DOCUMENT ===") + await search_index.update_document_async( + "codetide.parsers.python_parser.PythonParser", + "advanced class for parsing python files, extracting AST, and analyzing code structure with type hints" + ) + + # Add new document + logger.info("\n=== ADDING NEW DOCUMENT ===") + await search_index.update_document_async( + "codetide.search.fast_search.FastSearchEngine", + "high performance search engine using BM25 and TF-IDF for code retrieval" + ) + + # Test search after updates + logger.info("\nSearch after updates:") + results = await search_index.search_async("python parser", top_k=3) + for doc_key, score in results: + logger.info(f" {score:.3f}: {doc_key}") + + # Batch update multiple documents + logger.info("\n=== BATCH UPDATE ===") + updates = { + "examples.apply_patch.trim_to_patch_section": "enhanced function for trimming patch sections with better error handling", + "new.module.data_processor": "module for processing and transforming data with advanced algorithms", + "new.module.cache_manager": "efficient cache management system with LRU eviction policy" + } + + await search_index.batch_update_documents_async(updates) + + # Test batch search + logger.info("\nBatch search test:") + queries = ["python parser", "patch trim", "cache manager", "data processor"] + batch_results = await 
search_index.batch_search_async(queries, top_k=2) + + for query, results in zip(queries, batch_results): + logger.info(f"\nQuery: '{query}'") + for doc_key, score in results: + logger.info(f" {score:.3f}: {doc_key}") + + # Show final stats + logger.info("\n=== FINAL STATS ===") + stats = search_index.get_stats() + logger.info(f"Final index stats: {stats}") + + # Test document removal + logger.info("\n=== REMOVING DOCUMENT ===") + await search_index.remove_document_async("examples.aicore_dashboard.od") + + final_stats = search_index.get_stats() + logger.info(f"Stats after removal: {final_stats}") + + # Test incremental rebuild + logger.info("\n=== INCREMENTAL REBUILD ===") + await search_index.incremental_rebuild_async() + + # Save updated index + await search_index.save_index_async("updated_index.pkl") + logger.info("Updated index saved!") + + # # Sync wrapper usage example + # logger.info("\n=== SYNC WRAPPER EXAMPLE ===") + # sync_index = FastCodeSearchIndex({"test.doc": "test content"}) + # sync_index.update_document("test.doc", "updated test content") + # sync_index.update_document("new.doc", "brand new content") + + # results = sync_index.search("test content") + # logger.info("Sync wrapper results:") + # for doc_key, score in results: + # logger.info(f" {score:.3f}: {doc_key}") + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/codetide/search/preprocessor.py b/codetide/search/preprocessor.py new file mode 100644 index 0000000..e147502 --- /dev/null +++ b/codetide/search/preprocessor.py @@ -0,0 +1,256 @@ + +from functools import lru_cache +from typing import List +import unicodedata +import re + + +class CodeQueryPreprocessor: + """ + Blazingly fast query preprocessor optimized for code search. + Handles camelCase, snake_case, kebab-case, stemming, and code-specific terms. 
+ """ + + def __init__(self): + # Compile regex patterns once for maximum performance + self._camel_case_pattern = re.compile(r'([a-z])([A-Z])') + self._snake_kebab_pattern = re.compile(r'[_\-]+') + self._word_boundary_pattern = re.compile(r'\b\w+\b') + self._non_alphanumeric = re.compile(r'[^\w\s]') + self._multiple_spaces = re.compile(r'\s+') + self._number_pattern = re.compile(r'\d+') + + # Common code abbreviations and their expansions (cached for speed) + self._code_expansions = { + 'btn': 'button', + 'cfg': 'config configuration', + 'ctx': 'context', + 'db': 'database', + 'fn': 'function', + 'func': 'function', + 'impl': 'implementation implement', + 'mgr': 'manager', + 'obj': 'object', + 'param': 'parameter', + 'proc': 'process processor', + 'repo': 'repository', + 'req': 'request require', + 'res': 'response result', + 'str': 'string', + 'temp': 'temporary template', + 'util': 'utility utilities', + 'val': 'value', + 'var': 'variable', + 'auth': 'authentication authorize', + 'admin': 'administrator administration', + 'api': 'application programming interface', + 'ui': 'user interface', + 'url': 'uniform resource locator link', + 'http': 'hypertext transfer protocol', + 'json': 'javascript object notation', + 'xml': 'extensible markup language', + 'sql': 'structured query language', + 'css': 'cascading style sheets', + 'html': 'hypertext markup language', + 'js': 'javascript', + 'py': 'python', + 'ts': 'typescript', + 'async': 'asynchronous', + 'sync': 'synchronous', + } + + # Simple stemming rules for common programming terms + self._stem_rules = [ + (r'ies$', 'y'), # utilities -> utility + (r'ied$', 'y'), # applied -> apply + (r'ying$', 'y'), # applying -> apply + (r'ing$', ''), # processing -> process + (r'ed$', ''), # processed -> process + (r'er$', ''), # processor -> process + (r'est$', ''), # fastest -> fast + (r's$', ''), # functions -> function + ] + self._compiled_stem_rules = [(re.compile(pattern), replacement) + for pattern, replacement in self._stem_rules] + + # Stop words for code (less aggressive than natural language) + self._stop_words = { + 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', + 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', + 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', + 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', + 'we', 'they', 'me', 'him', 'her', 'us', 'them' + } + + @lru_cache(maxsize=1000) + def _expand_camel_case(self, word: str) -> str: + """Convert camelCase to space-separated words with caching""" + # Handle camelCase: getUserName -> get User Name + expanded = self._camel_case_pattern.sub(r'\1 \2', word) + return expanded.lower() + + @lru_cache(maxsize=1000) + def _expand_snake_kebab(self, word: str) -> str: + """Convert snake_case and kebab-case to space-separated with caching""" + # Handle snake_case and kebab-case: get_user_name -> get user name + return self._snake_kebab_pattern.sub(' ', word).lower() + + @lru_cache(maxsize=500) + def _simple_stem(self, word: str) -> str: + """Apply simple stemming rules with caching""" + if len(word) <= 3: # Don't stem very short words + return word + + for pattern, replacement in self._compiled_stem_rules: + if pattern.search(word): + stemmed = pattern.sub(replacement, word) + if len(stemmed) >= 2: # Don't create words that are too short + return stemmed + return word + + def _expand_abbreviations(self, words: List[str]) -> List[str]: + """Expand common code abbreviations""" + expanded = [] + for word in words: + if word in 
self._code_expansions:
+                expanded.extend(self._code_expansions[word].split())
+            expanded.append(word)
+        return expanded
+
+    def preprocess_query(self, query: str,
+                         expand_case: bool = True,
+                         expand_abbreviations: bool = True,
+                         apply_stemming: bool = True,
+                         remove_stop_words: bool = False,
+                         min_word_length: int = 2) -> str:
+        """
+        Preprocess a query for optimal code search performance.
+
+        Args:
+            query: Raw user query
+            expand_case: Whether to expand camelCase, snake_case, kebab-case
+            expand_abbreviations: Whether to expand common code abbreviations
+            apply_stemming: Whether to apply simple stemming
+            remove_stop_words: Whether to remove stop words (usually False for code)
+            min_word_length: Minimum word length to keep
+
+        Returns:
+            Preprocessed query string
+        """
+        if not query or not query.strip():
+            return ""
+
+        # Normalize unicode characters
+        query = unicodedata.normalize('NFKD', query)
+
+        # Remove excessive punctuation but keep some code-relevant chars
+        query = self._non_alphanumeric.sub(' ', query)
+
+        # Extract words, keeping original casing so camelCase can be detected below
+        words = self._word_boundary_pattern.findall(query)
+
+        # Process each word
+        processed_words = []
+
+        for word in words:
+            if len(word) < min_word_length:
+                continue
+
+            # Skip if it's just numbers (unless it's a version number context)
+            if self._number_pattern.fullmatch(word):
+                processed_words.append(word)  # Keep numbers as they might be important
+                continue
+
+            # Expand case conventions
+            if expand_case:
+                # Handle camelCase
+                if any(c.isupper() for c in word[1:]):  # Has uppercase after first char
+                    expanded = self._expand_camel_case(word)
+                    processed_words.extend(expanded.split())
+
+                # Handle snake_case and kebab-case
+                if '_' in word or '-' in word:
+                    expanded = self._expand_snake_kebab(word)
+                    processed_words.extend(expanded.split())
+
+            # Add original word (lowercased so abbreviation and stop-word lookups match)
+            processed_words.append(word.lower())
+
+        # Remove duplicates while preserving order
+        unique_words = []
+        seen = set()
+        for word in processed_words:
+            if word not in seen and len(word) >= min_word_length:
+                unique_words.append(word)
+                seen.add(word)
+
+        # Expand abbreviations
+        if expand_abbreviations:
+            unique_words = self._expand_abbreviations(unique_words)
+
+        # Apply stemming
+        if apply_stemming:
+            unique_words = [self._simple_stem(word) for word in unique_words]
+
+        # Remove stop words (usually not recommended for code search)
+        if remove_stop_words:
+            unique_words = [word for word in unique_words
+                            if word not in self._stop_words]
+
+        # Final cleanup and deduplication
+        final_words = []
+        seen = set()
+        for word in unique_words:
+            if word and len(word) >= min_word_length and word not in seen:
+                final_words.append(word)
+                seen.add(word)
+
+        return ' '.join(final_words)
+
+    def generate_query_variations(self, query: str) -> List[str]:
+        """Generate multiple query variations for better search coverage"""
+        variations = []
+
+        # Original query
+        variations.append(query)
+
+        # Preprocessed with different settings
+        variations.append(self.preprocess_query(query,
+                                                expand_case=True,
+                                                expand_abbreviations=True,
+                                                apply_stemming=False))
+
+        variations.append(self.preprocess_query(query,
+                                                expand_case=True,
+                                                expand_abbreviations=False,
+                                                apply_stemming=True))
+
+        variations.append(self.preprocess_query(query,
+                                                expand_case=False,
+                                                expand_abbreviations=True,
+                                                apply_stemming=True))
+
+        # Remove empty and duplicate variations
+        return list(filter(None, list(dict.fromkeys(variations))))
+
+if __name__ == "__main__":
+    # Test the preprocessor
+    print("=== QUERY PREPROCESSOR DEMO ===") + 
preprocessor = CodeQueryPreprocessor() + + test_queries = [ + "getUserByEmail", + "find-user-by-email", + "API_Controller", + "db cfg", + "string helpers", + "camelCase to snake", + "Hi lets update the DataBaseManager!" + ] + + for query in test_queries: + processed = preprocessor.preprocess_query(query) + variations = preprocessor.generate_query_variations(query) + print(f"\nOriginal: {query}") + print(f"Processed: {processed}") + print(f"Variations: {variations}") \ No newline at end of file diff --git a/examples/smart_code_search.py b/examples/smart_code_search.py new file mode 100644 index 0000000..c68be8d --- /dev/null +++ b/examples/smart_code_search.py @@ -0,0 +1,74 @@ +from codetide.search.code_search import SmartCodeSearch +from codetide import CodeTide +import os + +FILE_TEMPLATE = """{FILENAME} + +{CONTENT} +""" + +async def main(): + tide = await CodeTide.from_path(os.getenv("CODETIDE_REPO_PATH")) + search = SmartCodeSearch( + documents={ + codefile.file_path: codefile.file_path + # FILE_TEMPLATE.format(CONTENT=codefile.raw, FILENAME=codefile.file_path) + for codefile in tide.codebase.root + }, + ) + await search.initialize_async() + search_queries = ["Add me a smooth transition effect to the expansor in ReasoningMessage.jsx"] + + for query in search_queries: + print(f"\n--- Searching for: '{query}' ---") + results = await search.search_smart(query, top_k=5) + + for doc_key, score in results: + print(f" {score:.3f}: {doc_key}") + + searchV1 = SmartCodeSearch( + documents={ + codefile.file_path: codefile.file_path + # FILE_TEMPLATE.format(CONTENT=codefile.raw, FILENAME=codefile.file_path) + for codefile in tide.codebase.root + } + ) + await searchV1.initialize_async() + search_queries = ["Add me a smooth transition effect to the expansor in ReasoningMessage.jsx"] + + for query in search_queries: + print(f"\n--- Searching for: '{query}' ---") + results = await searchV1.search_smart(query, top_k=5) + + for doc_key, score in results: + print(f" {score:.3f}: {doc_key}") + + nodes_dict = tide.codebase.compile_tree_nodes_dict() + nodes_dict = { + filepath: contents for filepath, elements in nodes_dict.items() + if (contents := "\n".join([filepath] + elements).strip()) + } + + searchV2 = SmartCodeSearch(documents=nodes_dict) + await searchV2.initialize_async() + search_queries = ["Add me a smooth transition effect to the expansor in ReasoningMessage.jsx"] + + for query in search_queries: + print(f"\n--- Searching for: '{query}' ---") + results = await searchV2.search_smart(query, top_k=5) + + for doc_key, score in results: + print(f" {score:.3f}: {doc_key}") + + tide.codebase._build_tree_dict(filter_paths=[doc_key for doc_key, score in results]) + print(tide.codebase.get_tree_view(include_modules=True)) + + +if __name__ == "__main__": + from dotenv import load_dotenv + import asyncio + + load_dotenv() + asyncio.run(main()) + +
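
A minimal sketch of how the two new pieces could be wired together outside of SmartCodeSearch: CodeQueryPreprocessor turns a camelCase query into variations, and AsyncFastCodeSearchIndex scores all variations concurrently via batch_search_async, keeping the best score per document. Only the class and method names come from the diff above; the import path codetide.search.fast_search, the merge-by-max strategy, and the demo query are assumptions for illustration.

import asyncio

from codetide.search.preprocessor import CodeQueryPreprocessor
# Assumed module path for the async index introduced in this diff; adjust to the actual file.
from codetide.search.fast_search import AsyncFastCodeSearchIndex


async def demo():
    # Toy documents reusing the keys from the main() example above.
    documents = {
        "codetide.parsers.python_parser.PythonParser": "class for parsing python files and extracting code structure",
        "codetide.search.fast_search.FastSearchEngine": "high performance search engine using BM25 and TF-IDF for code retrieval",
    }

    index = AsyncFastCodeSearchIndex(documents, max_workers=2)
    await index.build_index_async()

    preprocessor = CodeQueryPreprocessor()
    # e.g. "PythonParser" -> "python parser ..." plus abbreviation/stemming variants
    variations = preprocessor.generate_query_variations("PythonParser for py files")

    # Score every variation concurrently; the 0.7*BM25 + 0.3*TF-IDF blend happens
    # inside the index's _score_chunk, so here we only merge across variations.
    per_variation = await index.batch_search_async(variations, top_k=5)
    best_scores = {}
    for results in per_variation:
        for doc_key, score in results:
            best_scores[doc_key] = max(score, best_scores.get(doc_key, 0.0))

    for doc_key, score in sorted(best_scores.items(), key=lambda item: item[1], reverse=True):
        print(f"{score:.3f}: {doc_key}")


if __name__ == "__main__":
    asyncio.run(demo())

Merging by maximum score across query variations is just one choice; averaging or rank fusion would work as well, and SmartCodeSearch may combine the preprocessor and index differently internally.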