diff --git a/src/ps_helper/extensions/metrics_extension.py b/src/ps_helper/extensions/metrics_extension.py
index e6f7426..7235c00 100644
--- a/src/ps_helper/extensions/metrics_extension.py
+++ b/src/ps_helper/extensions/metrics_extension.py
@@ -12,7 +12,7 @@ class MetricsExtension:
-    def __init__(self, stats, schema=None, unique_field=None, max_buckets=30):
+    def __init__(self, stats, schema=None, unique_field=None, max_buckets=30, items_expected=None):
         """
         Scrapy Metrics Extension.
@@ -39,14 +39,23 @@ def __init__(self, stats, schema=None, unique_field=None, max_buckets=30):
         self.schema = schema
         self.unique_field = unique_field
+        self.items_expected = items_expected
+
     @classmethod
     def from_crawler(cls, crawler):
         schema = getattr(crawler.spidercls, "schema", None)
         unique_field = getattr(crawler.spidercls, "unique_field", None)
         max_buckets = crawler.settings.getint("METRICS_TIMELINE_BUCKETS", 30)
-
-        ext = cls(crawler.stats, schema=schema, unique_field=unique_field, max_buckets=max_buckets)
+        items_expected = getattr(crawler.spidercls, "ITEMS_EXPECTED", None)
+
+        ext = cls(
+            crawler.stats,
+            schema=schema,
+            unique_field=unique_field,
+            max_buckets=max_buckets,
+            items_expected=items_expected
+        )
 
         crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
         crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
 
         return ext
@@ -108,14 +117,39 @@ def spider_closed(self, spider, reason):
         interval_size = max(1, math.ceil(total_minutes / self.max_buckets))
 
         # Success rate
-        successful_requests = self.stats.get_value("downloader/response_count", 0)
-        total_requests = self.stats.get_value("downloader/request_count", 0)
+        items = self.stats.get_value("custom/items_scraped", 0)
+        pages = self.stats.get_value("custom/pages_processed", 0)
+        total_requests = self.stats.get_value("downloader/response_count", 0)
         retries_total = self.stats.get_value("retry/count", 0)
 
-        adjusted_successful = max(successful_requests - retries_total, 0)
-        adjusted_total = max(total_requests, 1)
-
-        success_rate = (adjusted_successful / adjusted_total) * 100
+        status_200 = self.http_status_counter.get(200, 0)
+        http_success_rate = (status_200 / total_requests * 100) if total_requests > 0 else 0
+
+        # Efficiency
+        requests_per_item_obtained = total_requests / items if items > 0 else float('inf')
+
+        # Inefficiency penalty
+        if requests_per_item_obtained <= 3:
+            efficiency_factor = 1.0  # no penalty
+        elif requests_per_item_obtained <= 4:
+            efficiency_factor = 0.95  # 5% penalty
+        elif requests_per_item_obtained <= 5:
+            efficiency_factor = 0.90  # 10% penalty
+        elif requests_per_item_obtained <= 7:
+            efficiency_factor = 0.80  # 20% penalty
+        else:
+            efficiency_factor = 0.65  # 35% penalty (very inefficient)
+
+        if self.items_expected:
+            goal_achievement = (items / self.items_expected * 100) if self.items_expected > 0 else 0
+
+            success_rate = (
+                (goal_achievement * 0.7 + http_success_rate * 0.3) * efficiency_factor
+            )
+            success_rate = min(100, max(0, success_rate))
+        else:
+            success_rate = http_success_rate * efficiency_factor
+            success_rate = min(100, max(0, success_rate))
 
         # Group timeline
         aggregated = defaultdict(int)
@@ -134,9 +168,6 @@ def spider_closed(self, spider, reason):
             )
         ]
 
-        items = self.stats.get_value("custom/items_scraped", 0)
-        pages = self.stats.get_value("custom/pages_processed", 0)
-
         # Speed
         items_per_min = items / (elapsed / 60) if elapsed > 0 else 0
         pages_per_min = pages / (elapsed / 60) if elapsed > 0 else 0
@@ -174,6 +205,8 @@ def spider_closed(self, spider, reason):
             "pages_per_minute": round(pages_per_min, 2),
             "time_per_page_seconds": round(time_per_page, 2),
             "success_rate": round(success_rate, 2),
+            "http_success_rate": round(http_success_rate, 2),
+            "goal_achievement": round(goal_achievement, 2) if self.items_expected else None,
             "schema_coverage": {
                 "percentage": round(schema_coverage_percentage, 2),
                 "valid": self.valid_items,
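Note on the scoring change above: the success rate is no longer responses divided by requests. It is now a composite of goal achievement (weighted 70%) and HTTP success (weighted 30%), scaled by a request-efficiency penalty and clamped to [0, 100]. Below is a standalone sketch of that arithmetic with made-up sample numbers; the function name is ours and exists only for illustration, it is not part of the extension's API:

```python
# Standalone restatement of the composite success-rate logic in the diff above.
# Inputs are illustrative, not taken from a real crawl.

def composite_success_rate(items, total_requests, status_200, items_expected=None):
    http_success_rate = (status_200 / total_requests * 100) if total_requests > 0 else 0

    # Requests spent per scraped item; fewer is better.
    requests_per_item = total_requests / items if items > 0 else float("inf")
    if requests_per_item <= 3:
        efficiency_factor = 1.0
    elif requests_per_item <= 4:
        efficiency_factor = 0.95
    elif requests_per_item <= 5:
        efficiency_factor = 0.90
    elif requests_per_item <= 7:
        efficiency_factor = 0.80
    else:
        efficiency_factor = 0.65

    if items_expected:
        goal_achievement = items / items_expected * 100
        score = (goal_achievement * 0.7 + http_success_rate * 0.3) * efficiency_factor
    else:
        score = http_success_rate * efficiency_factor
    return min(100, max(0, score))

# 900 of 1000 expected items in 2700 requests (3 per item, all 200s):
# 0.7 * 90 + 0.3 * 100 = 93.0, no efficiency penalty -> 93.0
print(composite_success_rate(900, 2700, 2700, items_expected=1000))
```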
diff --git a/src/ps_helper/scripts/generate_report.py b/src/ps_helper/scripts/generate_report.py
index 9157ef7..c4d14a3 100644
--- a/src/ps_helper/scripts/generate_report.py
+++ b/src/ps_helper/scripts/generate_report.py
@@ -106,6 +106,8 @@ def load_scrapy_stats(json_path):
         "duration": format_duration(data.get("elapsed_time_seconds", 0)),
         "items_per_minute": round(data.get("items_per_minute", 0), 1),
         "pages_per_minute": round(data.get("pages_per_minute", 0), 2),
+        "http_success_rate": data.get("http_success_rate", 0),
+        "goal_achievement": data.get("goal_achievement", 0),
     }
 
     http_errors = data.get("http_errors", {})
@@ -253,14 +255,18 @@ def _generate_retry_reasons_html(data):
     )
 
     # Success rate
-    if scrapy_stats["success_rate"] >= 95:
+    if scrapy_stats["success_rate"] >= 90:
         status_class = "success"
         status_text = "Successful"
         icon = "✅"
-    elif scrapy_stats["success_rate"] >= 80:
+    elif scrapy_stats["success_rate"] >= 70:
         status_class = "warning"
         status_text = "With Warnings"
         icon = "⚠️"
+    elif scrapy_stats["success_rate"] >= 50:
+        status_class = "warning-orange"
+        status_text = "Below Target"
+        icon = "⚠️"
     else:
         status_class = "error"
         status_text = "Critical Error"
@@ -320,7 +326,18 @@ def _generate_retry_reasons_html(data):
            labels=df_errors["Error"],
            values=df_errors["Count"],
            marker=dict(
-                colors=["#FF5733", "#F8623D", "#C67448", "#838E56", "#00BF71"][
+                colors=[
+                    "#FF5733",  # red-orange (original)
+                    "#FF6B3D",  # bright orange
+                    "#FF8047",  # medium orange
+                    "#FF9551",  # light orange
+                    "#FFAA5C",  # yellowish orange
+                    "#D4B85E",  # greenish yellow
+                    "#A8C560",  # lime
+                    "#7CB862",  # lime green
+                    "#50AA64",  # medium green
+                    "#00BF71"   # emerald green (original)
+                ][
                    : len(df_errors)
                ]
            ),
@@ -562,6 +579,11 @@ def _generate_retry_reasons_html(data):
            border-color: #F8623D;
        }}
 
+        .status-banner.warning-orange {{
+            background: linear-gradient(135deg, #fff4e6 0%, #ffecd1 100%);
+            border-color: #FF8047;
+        }}
+
        .status-banner.error {{
            background: linear-gradient(135deg, #ffe8e6 0%, #ffd6d1 100%);
            border-color: #FF5733;
@@ -594,6 +616,7 @@ def _generate_retry_reasons_html(data):
 
        .status-text p.success {{ color: #059669; }}
        .status-text p.warning {{ color: #d97706; }}
+        .status-text p.warning-orange {{ color: #ea580c; }}
        .status-text p.error {{ color: #dc2626; }}
 
        .status-metrics {{
@@ -822,8 +845,12 @@ def _generate_retry_reasons_html(data):
                <div class="metric-label">Items Scraped</div>
            </div>
            <div class="metric-card">
-                <div class="metric-value">{scrapy_stats['success_rate']}%</div>
-                <div class="metric-label">Success Rate</div>
+                <div class="metric-value">{scrapy_stats['http_success_rate']}%</div>
+                <div class="metric-label">HTTP Success Rate</div>
+            </div>
+            <div class="metric-card">
+                <div class="metric-value">{scrapy_stats['goal_achievement']}%</div>
+                <div class="metric-label">Goal Achievement</div>
            </div>
            <div class="metric-card">
                <div class="metric-value">{scrapy_stats['duration']}</div>
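For quick reference, the status banner now maps the composite score to four tiers instead of three. A minimal restatement of the thresholds introduced in _generate_retry_reasons_html; the helper function is hypothetical, shown only to make the mapping explicit:

```python
def banner_status(success_rate: float) -> tuple[str, str]:
    # Mirrors the updated banner thresholds in generate_report.py:
    # 90+ success, 70+ warning, 50+ warning-orange, else error.
    if success_rate >= 90:
        return "success", "Successful"
    elif success_rate >= 70:
        return "warning", "With Warnings"
    elif success_rate >= 50:
        return "warning-orange", "Below Target"
    return "error", "Critical Error"

assert banner_status(93.0) == ("success", "Successful")
assert banner_status(64.5) == ("warning-orange", "Below Target")
```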
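Spiders opt into goal tracking by declaring ITEMS_EXPECTED at class level, which from_crawler reads via getattr(crawler.spidercls, "ITEMS_EXPECTED", None); spiders without the attribute keep the plain http_success_rate * efficiency_factor score. A minimal sketch, assuming a hypothetical spider name, URL, and target count:

```python
import scrapy


class ExampleSpider(scrapy.Spider):
    # Hypothetical spider, shown only to illustrate the new attribute.
    name = "example"
    start_urls = ["https://example.com/listing"]

    # Picked up by MetricsExtension.from_crawler; drives goal_achievement
    # (items scraped / items expected) in the composite success rate.
    ITEMS_EXPECTED = 1000

    def parse(self, response):
        for href in response.css("a.item::attr(href)").getall():
            yield response.follow(href, callback=self.parse_item)

    def parse_item(self, response):
        yield {"url": response.url, "title": response.css("h1::text").get()}
```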