Skip to content

Commit d19ed3c

Browse files
Copilot and Mte90 committed
Add EmbeddingClient wrapper with detailed logging and retries
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com>
1 parent 6950730 commit d19ed3c

File tree

2 files changed

+227
-0
lines changed

2 files changed

+227
-0
lines changed
Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,20 @@
1+
# ai/analyzer_embedding_usage_example.py
2+
import logging
3+
from ai.embedding_client import EmbeddingClient
4+
5+
# Logger used by the analyzer example below.
logger = logging.getLogger("ai.analyzer")

# Shared embedding client built with all-default arguments, so its settings
# come from whatever defaults ai.embedding_client defines (env-driven there).
client = EmbeddingClient()
9+
10+
def process_file_and_embed(file_path: str, chunks: list[str]):
    """Embed every chunk of a file and report the chunks that failed.

    Args:
        file_path: Path of the file the chunks came from; used for logging
            and forwarded to the client.
        chunks: Text chunks to embed, in order.

    Returns:
        The list returned by ``client.embed_multiple``: one dict per chunk,
        whose ``"embedding"`` is ``None`` when that chunk could not be embedded.
    """
    start_context = {"file": file_path, "num_chunks": len(chunks)}
    logger.info("Start embedding file", extra=start_context)

    embedded = client.embed_multiple(chunks, file_path=file_path)

    for entry in embedded:
        if entry.get("embedding") is not None:
            # Success: this is where the embedding would be stored.
            continue
        logger.warning(
            "Chunk embedding failed",
            extra={
                "file": file_path,
                "chunk_index": entry["chunk_index"],
                "error": entry.get("error"),
            },
        )

    return embedded

ai/embedding_client.py

Lines changed: 207 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,207 @@
1+
# ai/embedding_client.py
2+
import os
3+
import time
4+
import uuid
5+
import json
6+
import logging
7+
import traceback
8+
from typing import List, Optional, Dict, Any
9+
10+
import requests
11+
12+
logger = logging.getLogger("ai.analyzer.embedding")
13+
14+
# Module configuration. Each value is read once, at import time, from a
# PICOCODE_* environment variable and falls back to the default shown here.
EMBEDDING_API_URL = os.environ.get("PICOCODE_EMBEDDING_URL", "https://example.com/v1/embeddings")
EMBEDDING_API_KEY = os.environ.get("PICOCODE_EMBEDDING_API_KEY", "")
# Per-request timeout, in seconds.
DEFAULT_TIMEOUT = float(os.environ.get("PICOCODE_EMBEDDING_TIMEOUT", "30"))
# Number of retries after the first attempt.
MAX_RETRIES = int(os.environ.get("PICOCODE_EMBEDDING_RETRIES", "2"))
# Base delay (seconds) for the exponential backoff between attempts.
BACKOFF_FACTOR = float(os.environ.get("PICOCODE_EMBEDDING_BACKOFF", "1.5"))
MODEL_NAME = os.environ.get("PICOCODE_EMBEDDING_MODEL", "text-embedding-3-small")

# Opt-in HTTP debugging: PICOCODE_HTTP_DEBUG=1/true/yes (case-insensitive)
# lowers the "requests" and "urllib3" loggers to DEBUG.
if os.environ.get("PICOCODE_HTTP_DEBUG", "").lower() in {"1", "true", "yes"}:
    logging.getLogger("requests").setLevel(logging.DEBUG)
    logging.getLogger("urllib3").setLevel(logging.DEBUG)
26+
27+
28+
class EmbeddingError(Exception):
    """Raised when an embedding could not be obtained from the embedding API."""
30+
31+
32+
class EmbeddingClient:
    """Client for an HTTP embeddings endpoint with per-request logging and retries.

    Each chunk is POSTed as JSON (``{"model": ..., "input": ...}``) through a
    shared ``requests.Session``. Request exceptions (including timeouts) and
    transient HTTP statuses (408, 425, 429 and 5xx) are retried with
    exponential backoff. Permanent HTTP errors and malformed 2xx responses
    fail immediately with ``EmbeddingError``.

    The client owns its session: call ``close()`` when finished, or use the
    client as a context manager (``with EmbeddingClient() as client: ...``).
    """

    # Non-5xx statuses that signal a transient condition worth retrying.
    # Other 4xx codes (bad request, bad credentials, unknown model, ...) will
    # not be cured by sending the same request again.
    _RETRYABLE_STATUS = frozenset({408, 425, 429})

    # Maximum number of response-body characters kept for logs and errors.
    _PREVIEW_CHARS = 1000

    def __init__(self,
                 api_url: str = EMBEDDING_API_URL,
                 api_key: str = EMBEDDING_API_KEY,
                 model: str = MODEL_NAME,
                 timeout: float = DEFAULT_TIMEOUT,
                 max_retries: int = MAX_RETRIES,
                 backoff: float = BACKOFF_FACTOR):
        """Create a client and its HTTP session.

        Args:
            api_url: URL every embedding request is POSTed to.
            api_key: Bearer token; when empty, no Authorization header is sent.
            model: Model name placed in the request payload.
            timeout: Timeout passed to each HTTP request.
            max_retries: Number of retries after the first attempt.
            backoff: Base delay in seconds; the wait before retry ``n`` is
                ``backoff * 2 ** (n - 1)``.
        """
        self.api_url = api_url
        self.api_key = api_key
        self.model = model
        self.timeout = timeout
        self.max_retries = max_retries
        self.backoff = backoff
        self.session = requests.Session()
        if api_key:
            self.session.headers.update({"Authorization": f"Bearer {api_key}"})
        self.session.headers.update({"Content-Type": "application/json"})

    def close(self) -> None:
        """Close the underlying HTTP session."""
        self.session.close()

    def __enter__(self) -> "EmbeddingClient":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def _log_request_start(self, request_id: str, file_path: str, chunk_index: int, chunk_len: int):
        """Emit the DEBUG record that marks the start of an embedding request."""
        logger.debug(
            "Embedding request START",
            extra={
                "request_id": request_id,
                "file": file_path,
                "chunk_index": chunk_index,
                "chunk_length": chunk_len,
                "model": self.model,
                "api_url": self.api_url,
                "timeout": self.timeout,
            },
        )

    def _log_request_end(self, request_id: str, elapsed: float, status: Optional[int], response_body_preview: str):
        """Emit the DEBUG record that marks the end of one HTTP attempt."""
        logger.debug(
            "Embedding request END",
            extra={
                "request_id": request_id,
                "elapsed_s": elapsed,
                "status": status,
                "response_preview": response_body_preview,
            },
        )

    @staticmethod
    def _as_vector(value: Any) -> Optional[List[float]]:
        """Return *value* when it is a non-empty list, otherwise ``None``."""
        if isinstance(value, list) and value:
            return value
        return None

    @classmethod
    def _extract_embedding(cls, resp_json: Any) -> Optional[List[float]]:
        """Pull the embedding vector out of a decoded response body.

        Accepts ``{"data": [{"embedding": [...]}, ...]}`` (first item is used)
        and, as a fallback, a top-level ``{"embedding": [...]}``. Returns
        ``None`` when neither shape matches. Every level is type-checked, so
        this never raises on unexpected input.
        """
        if not isinstance(resp_json, dict):
            return None
        data = resp_json.get("data")
        if isinstance(data, list) and data and isinstance(data[0], dict):
            emb = cls._as_vector(data[0].get("embedding"))
            if emb is not None:
                return emb
        return cls._as_vector(resp_json.get("embedding"))

    @classmethod
    def _is_retryable_status(cls, status_code: int) -> bool:
        """Tell whether a non-2xx status is worth another attempt."""
        return status_code >= 500 or status_code in cls._RETRYABLE_STATUS

    def embed_text(self, text: str, file_path: str = "<unknown>", chunk_index: int = 0) -> List[float]:
        """Embed a single chunk of text and return the embedding vector.

        Args:
            text: The chunk to embed.
            file_path: Origin of the chunk; used only for logging.
            chunk_index: Position of the chunk in its file; used only for logging.

        Returns:
            The embedding as a non-empty list.

        Raises:
            EmbeddingError: Immediately for a malformed 2xx response or a
                non-retryable HTTP status; after ``max_retries`` retries for
                request exceptions and retryable statuses.
        """
        request_id = str(uuid.uuid4())
        self._log_request_start(request_id, file_path, chunk_index, len(text))

        payload = {
            "model": self.model,
            "input": text,
        }

        attempt = 0
        while True:
            attempt += 1
            last_exc: Optional[BaseException] = None
            start = time.perf_counter()
            try:
                resp = self.session.post(
                    self.api_url,
                    data=json.dumps(payload),
                    timeout=self.timeout,
                )
                elapsed = time.perf_counter() - start

                # A body that is not valid JSON is previewed as raw text instead.
                try:
                    resp_json = resp.json()
                except ValueError:
                    resp_json = None

                if resp_json is not None:
                    preview = json.dumps(resp_json)[:self._PREVIEW_CHARS]
                else:
                    preview = (resp.text or "")[:self._PREVIEW_CHARS]

                self._log_request_end(request_id, elapsed, resp.status_code, preview)

                if 200 <= resp.status_code < 300:
                    if not resp_json:
                        raise EmbeddingError(f"Empty JSON response (status={resp.status_code})")
                    emb = self._extract_embedding(resp_json)
                    if emb is None:
                        # Use the truncated preview: the full body may hold
                        # thousands of numbers.
                        raise EmbeddingError(f"Unexpected embedding response shape: {preview}")
                    logger.info(
                        "Embedding succeeded",
                        extra={"request_id": request_id, "file": file_path, "chunk_index": chunk_index},
                    )
                    return emb

                # Non-2xx
                logger.warning(
                    "Embedding API returned non-2xx",
                    extra={
                        "request_id": request_id,
                        "status_code": resp.status_code,
                        "file": file_path,
                        "chunk_index": chunk_index,
                        "attempt": attempt,
                        "body_preview": preview,
                    },
                )
                err_msg = f"Status {resp.status_code}: {preview}"
                if not self._is_retryable_status(resp.status_code):
                    raise EmbeddingError(f"Embedding API rejected the request (not retried). {err_msg}")
                # Retryable status: fall through to the retry logic below.

            except EmbeddingError:
                # Final failures raised on purpose above. Without this clause
                # the blanket handler below would swallow and retry them.
                raise
            except requests.Timeout as e:
                elapsed = time.perf_counter() - start
                err_msg = f"Timeout after {elapsed:.2f}s: {e}"
                last_exc = e
                logger.error("Embedding API Timeout", extra={"request_id": request_id, "error": str(e)})
            except requests.RequestException as e:
                elapsed = time.perf_counter() - start
                err_msg = f"RequestException after {elapsed:.2f}s: {e}\n{traceback.format_exc()}"
                last_exc = e
                logger.error("Embedding request exception", extra={"request_id": request_id, "error": err_msg})
            except Exception as e:
                # Boundary handler: callers only ever see EmbeddingError.
                elapsed = time.perf_counter() - start
                err_msg = f"Unexpected error after {elapsed:.2f}s: {e}\n{traceback.format_exc()}"
                last_exc = e
                logger.exception("Unexpected embedding exception", extra={"request_id": request_id})

            # Retry logic
            if attempt > self.max_retries:
                logger.error(
                    "Max retries exceeded for embedding request",
                    extra={"request_id": request_id, "file": file_path, "chunk_index": chunk_index, "attempts": attempt},
                )
                raise EmbeddingError(
                    f"Failed to get embedding after {attempt} attempts. Last error: {err_msg}"
                ) from last_exc

            # Exponential backoff: backoff, 2*backoff, 4*backoff, ...
            sleep_for = self.backoff * (2 ** (attempt - 1))
            logger.info(
                "Retrying embedding request",
                extra={
                    "request_id": request_id,
                    "file": file_path,
                    "chunk_index": chunk_index,
                    "attempt": attempt,
                    "sleep_s": sleep_for,
                },
            )
            time.sleep(sleep_for)

    def embed_multiple(self, chunks: List[str], file_path: str = "<unknown>") -> List[Dict[str, Any]]:
        """Embed a list of text chunks, one request per chunk, in order.

        A failing chunk does not abort the batch: its error is logged and a
        failure marker is stored in its place.

        Args:
            chunks: Text chunks to embed.
            file_path: Origin of the chunks; used only for logging.

        Returns:
            One dict per chunk, in input order:
            ``{"chunk_index": i, "embedding": [...]}`` on success, or
            ``{"chunk_index": i, "embedding": None, "error": "<message>"}``
            on failure.
        """
        results: List[Dict[str, Any]] = []
        for i, chunk in enumerate(chunks):
            try:
                emb = self.embed_text(chunk, file_path=file_path, chunk_index=i)
                results.append({"chunk_index": i, "embedding": emb})
            except EmbeddingError as e:
                logger.error(
                    "Failed to embed chunk",
                    extra={"file": file_path, "chunk_index": i, "error": str(e)},
                )
                # Keep a failure marker so result positions still match chunks.
                results.append({"chunk_index": i, "embedding": None, "error": str(e)})
        return results

0 commit comments

Comments
 (0)