@@ -12,63 +12,63 @@ class SmartChunker:
1212 Code-aware chunker that splits text based on language structure.
1313 Falls back to simple chunking for non-code or unknown languages.
1414 """
15-
15+
1616 def __init__ (self , chunk_size : int = 800 , overlap : int = 100 ):
1717 self .chunk_size = chunk_size
1818 self .overlap = overlap
19-
19+
def chunk(self, text: str, language: str = "text") -> List[str]:
    """
    Split text into chunks, using structure-aware rules for known languages.

    Args:
        text: Text content to chunk
        language: Programming language identifier (matched exactly,
            case-sensitively, against the supported names)

    Returns:
        List of text chunks
    """
    # Languages that are routed to the structure-aware code chunker.
    structured_languages = (
        "python",
        "javascript",
        "typescript",
        "java",
        "go",
        "rust",
        "c",
        "cpp",
    )

    # Anything else (including the default "text") gets plain chunking.
    if language not in structured_languages:
        return self._chunk_simple(text)

    return self._chunk_code(text, language)
35-
35+
3636 def _chunk_code (self , text : str , language : str ) -> List [str ]:
3737 """
3838 Smart chunking for code that respects structure.
3939 """
4040 # Split into logical units (functions, classes, etc.)
4141 units = self ._split_into_units (text , language )
42-
42+
4343 if not units :
4444 # Fallback to simple chunking if structure detection fails
4545 return self ._chunk_simple (text )
46-
46+
4747 chunks = []
4848 current_chunk = []
4949 current_size = 0
50-
50+
5151 for unit_text , unit_type in units :
5252 unit_size = len (unit_text )
53-
53+
5454 # If single unit is larger than chunk_size, split it
5555 if unit_size > self .chunk_size :
5656 # Save current chunk if it has content
5757 if current_chunk :
5858 chunks .append ("\n " .join (current_chunk ))
5959 current_chunk = []
6060 current_size = 0
61-
61+
6262 # Split large unit with simple chunking
6363 sub_chunks = self ._chunk_simple (unit_text )
6464 chunks .extend (sub_chunks )
6565 continue
66-
66+
6767 # Check if adding this unit would exceed chunk_size
6868 if current_size + unit_size > self .chunk_size and current_chunk :
6969 # Save current chunk
7070 chunks .append ("\n " .join (current_chunk ))
71-
71+
7272 # Start new chunk with overlap
7373 # Keep last unit for context
7474 if len (current_chunk ) > 1 :
@@ -82,13 +82,13 @@ def _chunk_code(self, text: str, language: str) -> List[str]:
8282 # Add to current chunk
8383 current_chunk .append (unit_text )
8484 current_size += unit_size
85-
85+
8686 # Add remaining chunk
8787 if current_chunk :
8888 chunks .append ("\n " .join (current_chunk ))
89-
89+
9090 return chunks if chunks else [text ]
91-
91+
9292 def _split_into_units (self , text : str , language : str ) -> List [Tuple [str , str ]]:
9393 """
9494 Split code into logical units (functions, classes, etc.).
@@ -104,11 +104,11 @@ def _split_into_units(self, text: str, language: str) -> List[Tuple[str, str]]:
104104 return self ._split_c_style (text )
105105 else :
106106 return []
107-
107+
108108 def _split_python (self , text : str ) -> List [Tuple [str , str ]]:
109109 """
110110 Split Python code into classes and functions.
111-
111+
112112 Uses indentation-based parsing. Works well for most Python code
113113 but may have edge cases with complex indentation patterns.
114114 Falls back to simple chunking if parsing fails.
@@ -117,59 +117,64 @@ def _split_python(self, text: str) -> List[Tuple[str, str]]:
117117 lines = text .split ("\n " )
118118 current_unit = []
119119 current_type = None
120- indent_stack = []
121-
120+ indent_stack = [] # only populated when a class/def starts
121+
122122 for i , line in enumerate (lines ):
123123 stripped = line .lstrip ()
124124 indent = len (line ) - len (stripped )
125-
125+
126126 # Detect class or function definition
127127 if stripped .startswith ("class " ) or stripped .startswith ("def " ):
128128 # Save previous unit if exists
129129 if current_unit :
130130 units .append (("\n " .join (current_unit ), current_type or "code" ))
131131 current_unit = []
132-
132+
133133 current_type = "class" if stripped .startswith ("class " ) else "function"
134134 current_unit = [line ]
135135 indent_stack = [indent ]
136136 elif current_unit :
137137 # Continue current unit
138138 current_unit .append (line )
139-
139+
140140 # Check if we're back to base indent (end of function/class)
141- if stripped and not stripped .startswith ("#" ) and indent <= indent_stack [0 ]:
141+ # Guard access to indent_stack: only compare indent if indent_stack is populated
142+ if stripped and not stripped .startswith ("#" ) and indent_stack and indent <= indent_stack [0 ]:
142143 if i < len (lines ) - 1 : # Not last line
143144 # Check next line to see if it's a new definition
144145 next_stripped = lines [i + 1 ].lstrip ()
145146 if next_stripped .startswith ("class " ) or next_stripped .startswith ("def " ):
146147 # End current unit
148+ # current_unit contains the line that dedented; we want to separate the trailing dedent line
149+ # The previous block is current_unit[:-1], remaining starts from current_unit[-1]
147150 units .append (("\n " .join (current_unit [:- 1 ]), current_type ))
148- current_unit = [line ] # Start module-level code
151+ # Start module-level accumulation with the dedent line
152+ current_unit = [current_unit [- 1 ]]
149153 current_type = "module"
154+ indent_stack = []
150155 else :
151156 # Module-level code
152157 if not current_unit :
153158 current_type = "module"
154159 current_unit .append (line )
155-
160+
156161 # Add remaining unit
157162 if current_unit :
158163 units .append (("\n " .join (current_unit ), current_type or "code" ))
159-
164+
160165 return units
161-
166+
162167 def _split_javascript (self , text : str ) -> List [Tuple [str , str ]]:
163168 """
164169 Split JavaScript/TypeScript code into functions and classes.
165-
170+
166171 Uses regex patterns to match function and class declarations.
167172 Works well for standard code patterns but may not handle all
168173 edge cases with nested structures. Falls back to brace-based
169174 splitting if regex matching doesn't find units.
170175 """
171176 units = []
172-
177+
173178 # Regex patterns for JS/TS
174179 # Match function declarations, arrow functions, class declarations
175180 # Note: Non-greedy matching, works for most cases but not perfect for deeply nested code
@@ -178,52 +183,52 @@ def _split_javascript(self, text: str) -> List[Tuple[str, str]]:
178183 r'((?:export\s+)?const\s+\w+\s*=\s*(?:async\s*)?\([^)]*\)\s*=>\s*{[\s\S]*?})' ,
179184 r'((?:export\s+)?class\s+\w+(?:\s+extends\s+\w+)?\s*{[\s\S]*?})' ,
180185 ]
181-
186+
182187 # Try to match and extract units
183188 for pattern in patterns :
184189 matches = re .finditer (pattern , text )
185190 for match in matches :
186191 unit_text = match .group (1 )
187192 unit_type = "function" if "function" in unit_text or "=>" in unit_text else "class"
188193 units .append ((unit_text , unit_type ))
189-
194+
190195 # If no matches, fall back to brace-based splitting
191196 if not units :
192197 units = self ._split_by_braces (text )
193-
198+
194199 return units
195-
200+
196201 def _split_java (self , text : str ) -> List [Tuple [str , str ]]:
197202 """Split Java code into classes and methods."""
198203 # Similar to JavaScript but with Java-specific patterns
199204 patterns = [
200205 r'((?:public|private|protected)?\s*(?:static)?\s*(?:class|interface|enum)\s+\w+[\s\S]*?{[\s\S]*?})' ,
201206 r'((?:public|private|protected)?\s*(?:static)?\s*(?:\w+\s+)?\w+\s*\([^)]*\)\s*(?:throws\s+\w+(?:,\s*\w+)*)?\s*{[\s\S]*?})' ,
202207 ]
203-
208+
204209 units = []
205210 for pattern in patterns :
206211 matches = re .finditer (pattern , text )
207212 for match in matches :
208213 unit_text = match .group (1 )
209214 unit_type = "class" if any (kw in unit_text for kw in ["class" , "interface" , "enum" ]) else "method"
210215 units .append ((unit_text , unit_type ))
211-
216+
212217 if not units :
213218 units = self ._split_by_braces (text )
214-
219+
215220 return units
216-
221+
217222 def _split_c_style (self , text : str ) -> List [Tuple [str , str ]]:
218223 """Split C-style languages (Go, Rust, C, C++) into functions."""
219224 units = self ._split_by_braces (text )
220225 return units if units else []
221-
226+
222227 def _split_by_braces (self , text : str ) -> List [Tuple [str , str ]]:
223228 """
224229 Generic brace-based splitting for C-style languages.
225230 Finds balanced brace blocks.
226-
231+
227232 Note: This is a simple heuristic that doesn't handle braces
228233 inside strings, comments, or template literals. It works well
229234 for most code but may produce imperfect results in edge cases.
@@ -234,49 +239,49 @@ def _split_by_braces(self, text: str) -> List[Tuple[str, str]]:
234239 current_unit = []
235240 brace_count = 0
236241 in_block = False
237-
242+
238243 for line in lines :
239244 current_unit .append (line )
240-
245+
241246 # Count braces (simple heuristic)
242247 # Note: Doesn't handle strings/comments perfectly, but works well in practice
243248 brace_count += line .count ("{" ) - line .count ("}" )
244-
249+
245250 if "{" in line and not in_block :
246251 in_block = True
247-
252+
248253 if in_block and brace_count == 0 :
249254 # Block closed
250255 units .append (("\n " .join (current_unit ), "function" ))
251256 current_unit = []
252257 in_block = False
253-
258+
254259 # Add remaining lines
255260 if current_unit :
256261 units .append (("\n " .join (current_unit ), "code" ))
257-
262+
258263 return units
259-
264+
260265 def _chunk_simple (self , text : str ) -> List [str ]:
261266 """
262267 Simple character-based chunking with overlap.
263268 Used as fallback or for non-code content.
264269 """
265270 if not text :
266271 return []
267-
272+
268273 if len (text ) <= self .chunk_size :
269274 return [text ]
270-
275+
271276 chunks = []
272277 step = max (1 , self .chunk_size - self .overlap )
273278 start = 0
274-
279+
275280 while start < len (text ):
276281 end = min (start + self .chunk_size , len (text ))
277282 chunks .append (text [start :end ])
278283 start += step
279-
284+
280285 return chunks
281286
282287
@@ -287,13 +292,13 @@ def _chunk_simple(self, text: str) -> List[str]:
287292def smart_chunk (text : str , language : str = "text" , chunk_size : int = 800 , overlap : int = 100 ) -> List [str ]:
288293 """
289294 Convenience function for smart chunking.
290-
295+
291296 Args:
292297 text: Text to chunk
293298 language: Programming language
294299 chunk_size: Maximum chunk size in characters
295300 overlap: Overlap between chunks in characters
296-
301+
297302 Returns:
298303 List of text chunks
299304 """
0 commit comments