57 changes: 45 additions & 12 deletions src/ps_helper/extensions/metrics_extension.py
@@ -12,7 +12,7 @@


class MetricsExtension:
-    def __init__(self, stats, schema=None, unique_field=None, max_buckets=30):
+    def __init__(self, stats, schema=None, unique_field=None, max_buckets=30, items_expected=None):
"""
Scrapy Metrics Extension.

@@ -39,14 +39,23 @@ def __init__(self, stats, schema=None, unique_field=None, max_buckets=30):
self.schema = schema
self.unique_field = unique_field

+        self.items_expected = items_expected

@classmethod
def from_crawler(cls, crawler):
schema = getattr(crawler.spidercls, "schema", None)
unique_field = getattr(crawler.spidercls, "unique_field", None)

max_buckets = crawler.settings.getint("METRICS_TIMELINE_BUCKETS", 30)

-        ext = cls(crawler.stats, schema=schema, unique_field=unique_field, max_buckets=max_buckets)
+        items_expected = getattr(crawler.spidercls, "ITEMS_EXPECTED", None)
+
+        ext = cls(
+            crawler.stats,
+            schema=schema,
+            unique_field=unique_field,
+            max_buckets=max_buckets,
+            items_expected=items_expected
+        )

crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
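For context, a minimal sketch of how a spider would opt in to the new goal metric. The ITEMS_EXPECTED attribute name matches the getattr lookup above; the spider itself is hypothetical:

import scrapy

class ProductsSpider(scrapy.Spider):
    name = "products"
    ITEMS_EXPECTED = 500  # hypothetical target; when absent, the extension falls back to the HTTP-only rate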
@@ -108,14 +117,39 @@ def spider_closed(self, spider, reason):
interval_size = max(1, math.ceil(total_minutes / self.max_buckets))

# Success rate
-        successful_requests = self.stats.get_value("downloader/response_count", 0)
-        total_requests = self.stats.get_value("downloader/request_count", 0)
+        items = self.stats.get_value("custom/items_scraped", 0)
+        pages = self.stats.get_value("custom/pages_processed", 0)
+        total_requests = self.stats.get_value("downloader/response_count", 0)
retries_total = self.stats.get_value("retry/count", 0)

-        adjusted_successful = max(successful_requests - retries_total, 0)
-        adjusted_total = max(total_requests, 1)
-
-        success_rate = (adjusted_successful / adjusted_total) * 100
+        status_200 = self.http_status_counter.get(200, 0)
+        http_success_rate = (status_200 / total_requests * 100) if total_requests > 0 else 0
+
+        # Efficiency
+        requests_per_item_obtained = total_requests / items if items > 0 else float('inf')
+
+        # Penalty for inefficiency
+        if requests_per_item_obtained <= 3:
+            efficiency_factor = 1.0  # no penalty
+        elif requests_per_item_obtained <= 4:
+            efficiency_factor = 0.95  # 5% penalty
+        elif requests_per_item_obtained <= 5:
+            efficiency_factor = 0.90  # 10% penalty
+        elif requests_per_item_obtained <= 7:
+            efficiency_factor = 0.80  # 20% penalty
+        else:
+            efficiency_factor = 0.65  # 35% penalty (very inefficient)
+
+        if self.items_expected:
+            goal_achievement = (items / self.items_expected * 100) if self.items_expected > 0 else 0
+
+            success_rate = (
+                (goal_achievement * 0.7 + http_success_rate * 0.3) * efficiency_factor
+            )
+            success_rate = min(100, max(0, success_rate))
+        else:
+            success_rate = http_success_rate * efficiency_factor
+            success_rate = min(100, max(0, success_rate))

# Group timeline
aggregated = defaultdict(int)
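A quick worked example of the composite rate above, using hypothetical numbers (450 items scraped against ITEMS_EXPECTED = 500; 1,800 responses, 1,700 of them HTTP 200):

# Hypothetical run: 450 items, 500 expected, 1800 responses, 1700 of them HTTP 200.
goal_achievement = 450 / 500 * 100       # 90.0
http_success_rate = 1700 / 1800 * 100    # ~94.44
requests_per_item = 1800 / 450           # 4.0 -> efficiency_factor = 0.95
success_rate = (90.0 * 0.7 + 94.44 * 0.3) * 0.95  # ~86.77, i.e. the "With Warnings" band per the report thresholds changed below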
@@ -134,9 +168,6 @@
)
]

-        items = self.stats.get_value("custom/items_scraped", 0)
-        pages = self.stats.get_value("custom/pages_processed", 0)

# Speed
items_per_min = items / (elapsed / 60) if elapsed > 0 else 0
pages_per_min = pages / (elapsed / 60) if elapsed > 0 else 0
@@ -174,6 +205,8 @@ def spider_closed(self, spider, reason):
"pages_per_minute": round(pages_per_min, 2),
"time_per_page_seconds": round(time_per_page, 2),
"success_rate": round(success_rate, 2),
"http_success_rate": round(http_success_rate, 2),
"goal_achievement": round(goal_achievement, 2) if self.items_expected else None,
"schema_coverage": {
"percentage": round(schema_coverage_percentage, 2),
"valid": self.valid_items,
37 changes: 32 additions & 5 deletions src/ps_helper/scripts/generate_report.py
@@ -106,6 +106,8 @@ def load_scrapy_stats(json_path):
"duration": format_duration(data.get("elapsed_time_seconds", 0)),
"items_per_minute": round(data.get("items_per_minute", 0), 1),
"pages_per_minute": round(data.get("pages_per_minute", 0), 2),
"http_success_rate": data.get("http_success_rate", 0),
"goal_achievement": data.get("goal_achievement", 0),
}

http_errors = data.get("http_errors", {})
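For orientation, a minimal sketch of the stats JSON these keys are read from; only fields visible in this diff are shown, and the values are hypothetical:

example_stats = {
    "elapsed_time_seconds": 3120,
    "items_per_minute": 8.7,
    "pages_per_minute": 12.34,
    "success_rate": 86.77,
    "http_success_rate": 94.44,   # new in this PR
    "goal_achievement": 90.0,     # new in this PR; None when ITEMS_EXPECTED is unset
    "http_errors": {"404": 12, "503": 3},
}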
@@ -253,14 +255,18 @@ def _generate_retry_reasons_html(data):
)

# Success rate
if scrapy_stats["success_rate"] >= 95:
if scrapy_stats["success_rate"] >= 90:
status_class = "success"
status_text = "Successful"
icon = "✅"
elif scrapy_stats["success_rate"] >= 80:
elif scrapy_stats["success_rate"] >= 70:
status_class = "warning"
status_text = "With Warnings"
icon = "⚠️"
elif scrapy_stats["success_rate"] >= 50:
status_class = "warning-orange"
status_text = "Below Target"
icon = "⚠️"
else:
status_class = "error"
status_text = "Critical Error"
@@ -320,7 +326,18 @@ def _generate_retry_reasons_html(data):
labels=df_errors["Error"],
values=df_errors["Count"],
marker=dict(
colors=["#FF5733", "#F8623D", "#C67448", "#838E56", "#00BF71"][
colors=[
"#FF5733", # Naranja rojo (original)
"#FF6B3D", # Naranja brillante
"#FF8047", # Naranja medio
"#FF9551", # Naranja claro
"#FFAA5C", # Naranja amarillento
"#D4B85E", # Amarillo verdoso
"#A8C560", # Lima
"#7CB862", # Verde lima
"#50AA64", # Verde medio
"#00BF71" # Verde esmeralda (original)
][
: len(df_errors)
]
),
@@ -562,6 +579,11 @@ def _generate_retry_reasons_html(data):
border-color: #F8623D;
}}

+    .status-banner.warning-orange {{
+        background: linear-gradient(135deg, #fff4e6 0%, #ffecd1 100%);
+        border-color: #FF8047;
+    }}

.status-banner.error {{
background: linear-gradient(135deg, #ffe8e6 0%, #ffd6d1 100%);
border-color: #FF5733;
@@ -594,6 +616,7 @@ def _generate_retry_reasons_html(data):

.status-text p.success {{ color: #059669; }}
.status-text p.warning {{ color: #d97706; }}
+    .status-text p.warning-orange {{ color: #ea580c; }}
.status-text p.error {{ color: #dc2626; }}

.status-metrics {{
@@ -822,8 +845,12 @@ def _generate_retry_reasons_html(data):
<div class="metric-label">Items Scraped</div>
</div>
<div class="metric-item">
<div class="metric-value">{scrapy_stats['success_rate']}%</div>
<div class="metric-label">Success Rate</div>
<div class="metric-value">{scrapy_stats['http_success_rate']}%</div>
<div class="metric-label">Http Success Rate</div>
</div>
<div class="metric-item">
<div class="metric-value">{scrapy_stats['goal_achievement']}%</div>
<div class="metric-label">Goal Achievement</div>
</div>
<div class="metric-item">
<div class="metric-value">{scrapy_stats['duration']}</div>