Skip to content

Commit cda0629

Browse files
committed
fix(pyproject): run
1 parent 79d96ef commit cda0629

File tree

2 files changed

+58
-52
lines changed

2 files changed

+58
-52
lines changed

ai/smart_chunker.py

Lines changed: 57 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -12,63 +12,63 @@ class SmartChunker:
1212
Code-aware chunker that splits text based on language structure.
1313
Falls back to simple chunking for non-code or unknown languages.
1414
"""
15-
15+
1616
def __init__(self, chunk_size: int = 800, overlap: int = 100):
1717
self.chunk_size = chunk_size
1818
self.overlap = overlap
19-
19+
2020
def chunk(self, text: str, language: str = "text") -> List[str]:
    """
    Chunk text based on language-specific rules.

    The language identifier is matched case-insensitively and with
    surrounding whitespace ignored, so "Python" and " python " are
    handled like "python". Anything that is not a recognised code
    language (including a non-string value such as None) goes through
    simple chunking.

    Args:
        text: Text content to chunk
        language: Programming language identifier

    Returns:
        List of text chunks
    """
    code_languages = ("python", "javascript", "typescript", "java", "go", "rust", "c", "cpp")

    # Normalise once so the same canonical identifier is used for the
    # membership test and for the structure-aware chunker.
    normalized = language.strip().lower() if isinstance(language, str) else ""

    if normalized in code_languages:
        return self._chunk_code(text, normalized)
    return self._chunk_simple(text)
35-
35+
3636
def _chunk_code(self, text: str, language: str) -> List[str]:
3737
"""
3838
Smart chunking for code that respects structure.
3939
"""
4040
# Split into logical units (functions, classes, etc.)
4141
units = self._split_into_units(text, language)
42-
42+
4343
if not units:
4444
# Fallback to simple chunking if structure detection fails
4545
return self._chunk_simple(text)
46-
46+
4747
chunks = []
4848
current_chunk = []
4949
current_size = 0
50-
50+
5151
for unit_text, unit_type in units:
5252
unit_size = len(unit_text)
53-
53+
5454
# If single unit is larger than chunk_size, split it
5555
if unit_size > self.chunk_size:
5656
# Save current chunk if it has content
5757
if current_chunk:
5858
chunks.append("\n".join(current_chunk))
5959
current_chunk = []
6060
current_size = 0
61-
61+
6262
# Split large unit with simple chunking
6363
sub_chunks = self._chunk_simple(unit_text)
6464
chunks.extend(sub_chunks)
6565
continue
66-
66+
6767
# Check if adding this unit would exceed chunk_size
6868
if current_size + unit_size > self.chunk_size and current_chunk:
6969
# Save current chunk
7070
chunks.append("\n".join(current_chunk))
71-
71+
7272
# Start new chunk with overlap
7373
# Keep last unit for context
7474
if len(current_chunk) > 1:
@@ -82,13 +82,13 @@ def _chunk_code(self, text: str, language: str) -> List[str]:
8282
# Add to current chunk
8383
current_chunk.append(unit_text)
8484
current_size += unit_size
85-
85+
8686
# Add remaining chunk
8787
if current_chunk:
8888
chunks.append("\n".join(current_chunk))
89-
89+
9090
return chunks if chunks else [text]
91-
91+
9292
def _split_into_units(self, text: str, language: str) -> List[Tuple[str, str]]:
9393
"""
9494
Split code into logical units (functions, classes, etc.).
@@ -104,11 +104,11 @@ def _split_into_units(self, text: str, language: str) -> List[Tuple[str, str]]:
104104
return self._split_c_style(text)
105105
else:
106106
return []
107-
107+
108108
def _split_python(self, text: str) -> List[Tuple[str, str]]:
109109
"""
110110
Split Python code into classes and functions.
111-
111+
112112
Uses indentation-based parsing. Works well for most Python code
113113
but may have edge cases with complex indentation patterns.
114114
Falls back to simple chunking if parsing fails.
@@ -117,59 +117,64 @@ def _split_python(self, text: str) -> List[Tuple[str, str]]:
117117
lines = text.split("\n")
118118
current_unit = []
119119
current_type = None
120-
indent_stack = []
121-
120+
indent_stack = [] # only populated when a class/def starts
121+
122122
for i, line in enumerate(lines):
123123
stripped = line.lstrip()
124124
indent = len(line) - len(stripped)
125-
125+
126126
# Detect class or function definition
127127
if stripped.startswith("class ") or stripped.startswith("def "):
128128
# Save previous unit if exists
129129
if current_unit:
130130
units.append(("\n".join(current_unit), current_type or "code"))
131131
current_unit = []
132-
132+
133133
current_type = "class" if stripped.startswith("class ") else "function"
134134
current_unit = [line]
135135
indent_stack = [indent]
136136
elif current_unit:
137137
# Continue current unit
138138
current_unit.append(line)
139-
139+
140140
# Check if we're back to base indent (end of function/class)
141-
if stripped and not stripped.startswith("#") and indent <= indent_stack[0]:
141+
# Guard access to indent_stack: only compare indent if indent_stack is populated
142+
if stripped and not stripped.startswith("#") and indent_stack and indent <= indent_stack[0]:
142143
if i < len(lines) - 1: # Not last line
143144
# Check next line to see if it's a new definition
144145
next_stripped = lines[i + 1].lstrip()
145146
if next_stripped.startswith("class ") or next_stripped.startswith("def "):
146147
# End current unit
148+
# current_unit contains the line that dedented; we want to separate the trailing dedent line
149+
# The previous block is current_unit[:-1], remaining starts from current_unit[-1]
147150
units.append(("\n".join(current_unit[:-1]), current_type))
148-
current_unit = [line] # Start module-level code
151+
# Start module-level accumulation with the dedent line
152+
current_unit = [current_unit[-1]]
149153
current_type = "module"
154+
indent_stack = []
150155
else:
151156
# Module-level code
152157
if not current_unit:
153158
current_type = "module"
154159
current_unit.append(line)
155-
160+
156161
# Add remaining unit
157162
if current_unit:
158163
units.append(("\n".join(current_unit), current_type or "code"))
159-
164+
160165
return units
161-
166+
162167
def _split_javascript(self, text: str) -> List[Tuple[str, str]]:
163168
"""
164169
Split JavaScript/TypeScript code into functions and classes.
165-
170+
166171
Uses regex patterns to match function and class declarations.
167172
Works well for standard code patterns but may not handle all
168173
edge cases with nested structures. Falls back to brace-based
169174
splitting if regex matching doesn't find units.
170175
"""
171176
units = []
172-
177+
173178
# Regex patterns for JS/TS
174179
# Match function declarations, arrow functions, class declarations
175180
# Note: Non-greedy matching, works for most cases but not perfect for deeply nested code
@@ -178,52 +183,52 @@ def _split_javascript(self, text: str) -> List[Tuple[str, str]]:
178183
r'((?:export\s+)?const\s+\w+\s*=\s*(?:async\s*)?\([^)]*\)\s*=>\s*{[\s\S]*?})',
179184
r'((?:export\s+)?class\s+\w+(?:\s+extends\s+\w+)?\s*{[\s\S]*?})',
180185
]
181-
186+
182187
# Try to match and extract units
183188
for pattern in patterns:
184189
matches = re.finditer(pattern, text)
185190
for match in matches:
186191
unit_text = match.group(1)
187192
unit_type = "function" if "function" in unit_text or "=>" in unit_text else "class"
188193
units.append((unit_text, unit_type))
189-
194+
190195
# If no matches, fall back to brace-based splitting
191196
if not units:
192197
units = self._split_by_braces(text)
193-
198+
194199
return units
195-
200+
196201
def _split_java(self, text: str) -> List[Tuple[str, str]]:
197202
"""Split Java code into classes and methods."""
198203
# Similar to JavaScript but with Java-specific patterns
199204
patterns = [
200205
r'((?:public|private|protected)?\s*(?:static)?\s*(?:class|interface|enum)\s+\w+[\s\S]*?{[\s\S]*?})',
201206
r'((?:public|private|protected)?\s*(?:static)?\s*(?:\w+\s+)?\w+\s*\([^)]*\)\s*(?:throws\s+\w+(?:,\s*\w+)*)?\s*{[\s\S]*?})',
202207
]
203-
208+
204209
units = []
205210
for pattern in patterns:
206211
matches = re.finditer(pattern, text)
207212
for match in matches:
208213
unit_text = match.group(1)
209214
unit_type = "class" if any(kw in unit_text for kw in ["class", "interface", "enum"]) else "method"
210215
units.append((unit_text, unit_type))
211-
216+
212217
if not units:
213218
units = self._split_by_braces(text)
214-
219+
215220
return units
216-
221+
217222
def _split_c_style(self, text: str) -> List[Tuple[str, str]]:
218223
"""Split C-style languages (Go, Rust, C, C++) into functions."""
219224
units = self._split_by_braces(text)
220225
return units if units else []
221-
226+
222227
def _split_by_braces(self, text: str) -> List[Tuple[str, str]]:
223228
"""
224229
Generic brace-based splitting for C-style languages.
225230
Finds balanced brace blocks.
226-
231+
227232
Note: This is a simple heuristic that doesn't handle braces
228233
inside strings, comments, or template literals. It works well
229234
for most code but may produce imperfect results in edge cases.
@@ -234,49 +239,49 @@ def _split_by_braces(self, text: str) -> List[Tuple[str, str]]:
234239
current_unit = []
235240
brace_count = 0
236241
in_block = False
237-
242+
238243
for line in lines:
239244
current_unit.append(line)
240-
245+
241246
# Count braces (simple heuristic)
242247
# Note: Doesn't handle strings/comments perfectly, but works well in practice
243248
brace_count += line.count("{") - line.count("}")
244-
249+
245250
if "{" in line and not in_block:
246251
in_block = True
247-
252+
248253
if in_block and brace_count == 0:
249254
# Block closed
250255
units.append(("\n".join(current_unit), "function"))
251256
current_unit = []
252257
in_block = False
253-
258+
254259
# Add remaining lines
255260
if current_unit:
256261
units.append(("\n".join(current_unit), "code"))
257-
262+
258263
return units
259-
264+
260265
def _chunk_simple(self, text: str) -> List[str]:
261266
"""
262267
Simple character-based chunking with overlap.
263268
Used as fallback or for non-code content.
264269
"""
265270
if not text:
266271
return []
267-
272+
268273
if len(text) <= self.chunk_size:
269274
return [text]
270-
275+
271276
chunks = []
272277
step = max(1, self.chunk_size - self.overlap)
273278
start = 0
274-
279+
275280
while start < len(text):
276281
end = min(start + self.chunk_size, len(text))
277282
chunks.append(text[start:end])
278283
start += step
279-
284+
280285
return chunks
281286

282287

@@ -287,13 +292,13 @@ def _chunk_simple(self, text: str) -> List[str]:
287292
def smart_chunk(text: str, language: str = "text", chunk_size: int = 800, overlap: int = 100) -> List[str]:
288293
"""
289294
Convenience function for smart chunking.
290-
295+
291296
Args:
292297
text: Text to chunk
293298
language: Programming language
294299
chunk_size: Maximum chunk size in characters
295300
overlap: Overlap between chunks in characters
296-
301+
297302
Returns:
298303
List of text chunks
299304
"""

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ dependencies = [
2121
[build-system]
2222
requires = ["setuptools>=42", "wheel"]
2323
build-backend = "setuptools.build_meta"
24+
2425
[tool.setuptools.packages.find]
2526
where = ["."]
2627
include = ["ai*", "db*", "services*", "endpoints*"]

0 commit comments

Comments
 (0)