diff --git a/examples/grafana/METRICS.md b/examples/grafana/METRICS.md new file mode 100644 index 0000000..fab3e92 --- /dev/null +++ b/examples/grafana/METRICS.md @@ -0,0 +1,161 @@ +# Iceberg REST Catalog Metrics + +This document describes the Prometheus metrics exposed by the Iceberg REST Catalog. + +## Dashboard Overview + +### Overview & Scan Metrics +![Overview and Scan Metrics](screenshots/overview-scan-metrics.png) + +### Commit Metrics & Delete Files +![Commit Metrics and Delete Files](screenshots/commit-delete-metrics.png) + +### HTTP/REST Endpoint Metrics +![HTTP REST Endpoint Metrics](screenshots/http-metrics.png) + +### InsertWatch (S3 Watch) Metrics +![InsertWatch S3 Watch Metrics](screenshots/insertwatch-metrics.png) + +### Maintenance Metrics +![Maintenance Metrics](screenshots/maintenance-metrics.png) + +## Metrics Reference + +### Iceberg Table Metrics + +These metrics are reported by Iceberg clients when they perform operations on tables. + +#### Scan Metrics + +| Metric Name | Type | Labels | Description | +|-------------|------|--------|-------------| +| `iceberg_scans_total` | Counter | catalog, namespace, table | Total number of Iceberg table scans | +| `iceberg_scan_result_data_files_total` | Counter | catalog, namespace, table | Total number of data files in scan results | +| `iceberg_scan_result_delete_files_total` | Counter | catalog, namespace, table | Total number of delete files in scan results | +| `iceberg_scan_indexed_delete_files_total` | Counter | catalog, namespace, table | Total number of indexed delete files in scan results | +| `iceberg_scan_positional_delete_files_total` | Counter | catalog, namespace, table | Total number of positional delete files in scan results | +| `iceberg_scan_equality_delete_files_total` | Counter | catalog, namespace, table | Total number of equality delete files in scan results | +| `iceberg_scan_total_data_manifests` | Counter | catalog, namespace, table | Total number of data manifests considered 
during scans | +| `iceberg_scan_total_delete_manifests` | Counter | catalog, namespace, table | Total number of delete manifests considered during scans | +| `iceberg_scan_scanned_data_manifests` | Counter | catalog, namespace, table | Total number of data manifests actually scanned | +| `iceberg_scan_skipped_data_manifests` | Counter | catalog, namespace, table | Total number of data manifests skipped during scans | +| `iceberg_scan_total_file_size_bytes` | Counter | catalog, namespace, table | Total file size in bytes for scanned data files | +| `iceberg_scan_total_delete_file_size_bytes` | Counter | catalog, namespace, table | Total file size in bytes for scanned delete files | +| `iceberg_scan_planning_duration_seconds` | Histogram | catalog, namespace, table | Duration of scan planning in seconds | +| `iceberg_scan_data_files_per_scan` | Histogram | catalog, namespace, table | Distribution of data files per scan | + +#### Commit Metrics + +| Metric Name | Type | Labels | Description | +|-------------|------|--------|-------------| +| `iceberg_commits_total` | Counter | catalog, namespace, table, operation | Total number of Iceberg table commits | +| `iceberg_commit_added_data_files_total` | Counter | catalog, namespace, table, operation | Total number of data files added in commits | +| `iceberg_commit_removed_data_files_total` | Counter | catalog, namespace, table, operation | Total number of data files removed in commits | +| `iceberg_commit_added_delete_files_total` | Counter | catalog, namespace, table, operation | Total number of delete files added in commits | +| `iceberg_commit_removed_delete_files_total` | Counter | catalog, namespace, table, operation | Total number of delete files removed in commits | +| `iceberg_commit_added_records_total` | Counter | catalog, namespace, table, operation | Total number of records added in commits | +| `iceberg_commit_removed_records_total` | Counter | catalog, namespace, table, operation | Total number of records 
removed in commits | +| `iceberg_commit_added_equality_deletes_total` | Counter | catalog, namespace, table, operation | Total number of equality deletes added in commits | +| `iceberg_commit_total_files_size_bytes` | Counter | catalog, namespace, table, operation | Total size in bytes of files involved in commits | +| `iceberg_commit_duration_seconds` | Histogram | catalog, namespace, table, operation | Duration of commit operations in seconds | + +#### Reporter Metrics + +| Metric Name | Type | Labels | Description | +|-------------|------|--------|-------------| +| `iceberg_metrics_reporter_active` | Counter | - | Iceberg metrics reporter status (value 1 means reporter is active) | +| `iceberg_metrics_report_errors_total` | Counter | type | Total number of errors while processing metrics reports | + +### Catalog Metrics + +These metrics track catalog-level statistics. + +| Metric Name | Type | Labels | Description | +|-------------|------|--------|-------------| +| `iceberg_catalog_tables` | Gauge | catalog | Current number of tables in the catalog | +| `iceberg_catalog_namespaces` | Gauge | catalog | Current number of namespaces in the catalog | +| `iceberg_catalog_operations_total` | Counter | catalog, operation | Total number of catalog operations (create_table, drop_table, create_namespace, drop_namespace) | + +### Table Metrics + +These metrics track table-level statistics from commit reports. + +| Metric Name | Type | Labels | Description | +|-------------|------|--------|-------------| +| `iceberg_table_snapshots_total` | Counter | catalog, namespace, table | Total number of snapshots created per table | +| `iceberg_table_schema_updates_total` | Counter | catalog, namespace, table | Total number of schema evolutions per table | + +### HTTP/REST API Metrics + +These metrics track HTTP requests to the REST catalog API. 
+ +| Metric Name | Type | Labels | Description | +|-------------|------|--------|-------------| +| `iceberg_http_requests_total` | Counter | method, route | Total number of HTTP requests | +| `iceberg_http_responses_total` | Counter | method, route, status_class | Total number of HTTP responses by status class | +| `iceberg_http_request_duration_seconds` | Histogram | method, route | HTTP request duration in seconds | +| `iceberg_http_requests_in_flight` | Gauge | - | Number of HTTP requests currently being processed | +| `iceberg_http_response_size_bytes_total` | Counter | method, route | Total size of HTTP responses in bytes | + +### InsertWatch (S3 Watch) Metrics + +These metrics track S3 event-driven file insertions. + +| Metric Name | Type | Labels | Description | +|-------------|------|--------|-------------| +| `ice_watch_poll_requests_total` | Counter | table, queue, queue_type | Total poll requests to the message queue | +| `ice_watch_messages_received_total` | Counter | table, queue, queue_type | Total messages received from the queue | +| `ice_watch_events_received_total` | Counter | table, queue, queue_type | Total S3 events received (one message may contain multiple events) | +| `ice_watch_events_matched_total` | Counter | table, queue, queue_type | Total S3 events that matched an input pattern | +| `ice_watch_events_not_matched_total` | Counter | table, queue, queue_type | Total S3 events that did not match any input pattern | +| `ice_watch_events_skipped_total` | Counter | table, queue, queue_type | Total S3 events skipped (non-ObjectCreated events) | +| `ice_watch_files_inserted_total` | Counter | table, queue, queue_type | Total files inserted from S3 events | +| `ice_watch_transactions_total` | Counter | table, queue, queue_type | Total insert transactions committed | +| `ice_watch_transactions_failed_total` | Counter | table, queue, queue_type | Total failed transactions | +| `ice_watch_retry_attempts_total` | Counter | table, queue, queue_type | Total retry
attempts | +| `ice_watch_queue_receive_errors_total` | Counter | table, queue, queue_type | Total errors when receiving messages from the queue | +| `ice_watch_queue_delete_errors_total` | Counter | table, queue, queue_type | Total errors when deleting/acknowledging messages | +| `ice_watch_message_parse_errors_total` | Counter | table, queue, queue_type | Total message parsing errors | + +### Maintenance Metrics + +These metrics track background maintenance operations. + +| Metric Name | Type | Labels | Description | +|-------------|------|--------|-------------| +| `ice_maintenance_runs_total` | Counter | status | Total number of maintenance runs | +| `ice_maintenance_duration_seconds` | Histogram | - | Duration of maintenance runs in seconds | +| `ice_maintenance_in_progress` | Gauge | - | Whether maintenance is currently running (1 = running, 0 = idle) | +| `ice_maintenance_last_run_timestamp` | Gauge | - | Unix timestamp of the last maintenance run | +| `ice_maintenance_start_timestamp` | Gauge | - | Unix timestamp when the current maintenance run started | +| `ice_maintenance_skipped_total` | Counter | - | Total number of times maintenance was skipped (already in maintenance mode) | +| `ice_maintenance_orphan_files_found_total` | Counter | table | Total orphaned files discovered | +| `ice_maintenance_orphan_files_deleted_total` | Counter | table | Total orphaned files successfully deleted | +| `ice_maintenance_orphan_files_excluded_total` | Counter | table | Total orphaned files excluded by the whitelist | +| `ice_maintenance_orphan_delete_failures_total` | Counter | table | Total orphaned files that failed to delete | +| `ice_maintenance_compaction_files_merged_total` | Counter | table | Total input files merged during compaction | +| `ice_maintenance_compaction_output_files_total` | Counter | table | Total output files produced by compaction | +| `ice_maintenance_compaction_bytes_read_total` | Counter | table | Total bytes read during compaction | +| `ice_maintenance_compaction_bytes_written_total` | Counter | table |
Total bytes written during compaction | + +## Label Descriptions + +| Label | Description | +|-------|-------------| +| `catalog` | Catalog name (default: "default") | +| `namespace` | Namespace/schema name (e.g., "db.schema") | +| `table` | Table name | +| `operation` | Commit operation type (e.g., "append", "replace", "overwrite") | +| `method` | HTTP method (GET, POST, DELETE, etc.) | +| `route` | REST API route name (e.g., LIST_NAMESPACES, LOAD_TABLE) | +| `status_class` | HTTP status code class (200, 400, 404, 500, etc.) | +| `status` | Maintenance run status (success, failure) or event match status | +| `type` | Error type for metrics report errors | +| `queue` | Queue URL for InsertWatch | +| `queue_type` | Queue type (e.g., "sqs", "kafka") for InsertWatch | + + +## Grafana Dashboard + +Import the dashboard from `iceberg-metrics-dashboard.json` in this directory. + diff --git a/examples/grafana/iceberg-metrics-dashboard.json b/examples/grafana/iceberg-metrics-dashboard.json new file mode 100644 index 0000000..106527a --- /dev/null +++ b/examples/grafana/iceberg-metrics-dashboard.json @@ -0,0 +1,4027 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": 2, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 100, + "panels": [], + "title": "Overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": 
[] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 1 + }, + "id": 400, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(iceberg_catalog_tables{catalog=~\"$catalog\"}) or vector(0)", + "legendFormat": "Tables", + "range": true, + "refId": "A" + } + ], + "title": "Tables", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 3, + "y": 1 + }, + "id": 401, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(iceberg_catalog_namespaces{catalog=~\"$catalog\"}) or vector(0)", + "legendFormat": "Namespaces", + "range": true, + "refId": "A" + } + ], + "title": "Namespaces", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 6, + "y": 
1 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(increase(iceberg_scans_total{catalog=~\"$catalog\", namespace=~\"$namespace\", table=~\"$table\"}[$__range]))", + "legendFormat": "Scans", + "range": true, + "refId": "A" + } + ], + "title": "Total Scans", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 9, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(iceberg_commits_total{catalog=~\"$catalog\", namespace=~\"$namespace\", table=~\"$table\"})", + "legendFormat": "Commits", + "range": true, + "refId": "A" + } + ], + "title": "Total Commits", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "purple", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 
3, + "x": 12, + "y": 1 + }, + "id": 402, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(iceberg_table_snapshots_total{catalog=~\"$catalog\", namespace=~\"$namespace\", table=~\"$table\"})", + "legendFormat": "Snapshots", + "range": true, + "refId": "A" + } + ], + "title": "Snapshots", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "orange", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 15, + "y": 1 + }, + "id": 403, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(iceberg_table_schema_updates_total{catalog=~\"$catalog\", namespace=~\"$namespace\", table=~\"$table\"})", + "legendFormat": "Schema Updates", + "range": true, + "refId": "A" + } + ], + "title": "Schema Updates", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, 
+ "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 18, + "y": 1 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(increase(iceberg_metrics_report_errors_total[$__range])) or vector(0)", + "legendFormat": "Errors", + "refId": "A" + } + ], + "title": "Metrics Errors", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 21, + "y": 1 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(iceberg_commit_added_records_total{catalog=~\"$catalog\", namespace=~\"$namespace\", table=~\"$table\"})", + "legendFormat": "Records Added", + "range": true, + "refId": "A" + } + ], + "title": "Records Added", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 
0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 5 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "sum" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (catalog, namespace, table) (rate(iceberg_scans_total{catalog=~\"$catalog\", namespace=~\"$namespace\", table=~\"$table\"}[$__rate_interval]))", + "legendFormat": "{{catalog}}.{{namespace}}.{{table}}", + "refId": "A" + } + ], + "title": "Scan Rate by Table", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + 
"mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 5 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "sum" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (catalog, namespace, table, operation) (rate(iceberg_commits_total{catalog=~\"$catalog\", namespace=~\"$namespace\", table=~\"$table\"}[$__rate_interval]))", + "legendFormat": "{{catalog}}.{{namespace}}.{{table}} ({{operation}})", + "refId": "A" + } + ], + "title": "Commit Rate by Table", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 13 + }, + "id": 404, + "options": { + "legend": { + "calcs": [ + "sum" + ], + "displayMode": "table", + "placement": "bottom", + 
"showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (catalog, operation) (rate(iceberg_catalog_operations_total{catalog=~\"$catalog\"}[$__rate_interval]))", + "legendFormat": "{{catalog}} - {{operation}}", + "refId": "A" + } + ], + "title": "Catalog Operations Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 13 + }, + "id": 405, + "options": { + "legend": { + "calcs": [ + "sum" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (catalog, namespace, table) (rate(iceberg_table_snapshots_total{catalog=~\"$catalog\", namespace=~\"$namespace\", table=~\"$table\"}[$__rate_interval]))", + "legendFormat": "{{catalog}}.{{namespace}}.{{table}}", + "refId": "A" + } + ], + "title": "Snapshot 
Creation Rate", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 101, + "panels": [], + "title": "Scan Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.50, sum by (le, catalog, namespace, table) (rate(iceberg_scan_planning_duration_seconds_bucket{catalog=~\"$catalog\", namespace=~\"$namespace\", table=~\"$table\"}[$__rate_interval])))", + "legendFormat": "p50 - {{catalog}}.{{namespace}}.{{table}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.95, sum by (le, catalog, namespace, table) 
(rate(iceberg_scan_planning_duration_seconds_bucket{catalog=~\"$catalog\", namespace=~\"$namespace\", table=~\"$table\"}[$__rate_interval])))", + "legendFormat": "p95 - {{catalog}}.{{namespace}}.{{table}}", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.99, sum by (le, catalog, namespace, table) (rate(iceberg_scan_planning_duration_seconds_bucket{catalog=~\"$catalog\", namespace=~\"$namespace\", table=~\"$table\"}[$__rate_interval])))", + "legendFormat": "p99 - {{catalog}}.{{namespace}}.{{table}}", + "refId": "C" + } + ], + "title": "Scan Planning Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": 
"histogram_quantile(0.50, sum by (le, catalog, namespace, table) (rate(iceberg_scan_data_files_per_scan_bucket{catalog=~\"$catalog\", namespace=~\"$namespace\", table=~\"$table\"}[$__rate_interval])))", + "legendFormat": "p50 - {{catalog}}.{{namespace}}.{{table}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.95, sum by (le, catalog, namespace, table) (rate(iceberg_scan_data_files_per_scan_bucket{catalog=~\"$catalog\", namespace=~\"$namespace\", table=~\"$table\"}[$__rate_interval])))", + "legendFormat": "p95 - {{catalog}}.{{namespace}}.{{table}}", + "refId": "B" + } + ], + "title": "Data Files Per Scan", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 102, + "panels": [], + "title": "Commit Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 23 + }, + "id": 20, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": 
"bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum by (catalog, namespace, table, operation) (increase(iceberg_commit_duration_seconds_sum{catalog=~\"$catalog\", namespace=~\"$namespace\", table=~\"$table\"}[$__range])) ", + "legendFormat": "p50 - {{catalog}}.{{namespace}}.{{table}} ({{operation}})", + "range": true, + "refId": "A" + } + ], + "title": "Commit Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 23 + }, + "id": 21, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum by (catalog, namespace, table, operation) 
(iceberg_commit_added_data_files_total{catalog=~\"$catalog\", namespace=~\"$namespace\", table=~\"$table\"})", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Data Files Added", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 103, + "panels": [], + "title": "Delete Files", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 30, + "options": { + "legend": { + "calcs": [ + "sum" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum by (catalog, namespace, table) (rate(iceberg_scan_positional_delete_files_total{catalog=~\"$catalog\", namespace=~\"$namespace\", table=~\"$table\"}[$__rate_interval]))", + "legendFormat": "Positional - 
{{catalog}}.{{namespace}}.{{table}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (catalog, namespace, table) (rate(iceberg_scan_equality_delete_files_total{catalog=~\"$catalog\", namespace=~\"$namespace\", table=~\"$table\"}[$__rate_interval]))", + "legendFormat": "Equality - {{catalog}}.{{namespace}}.{{table}}", + "refId": "B" + } + ], + "title": "Delete Files Scanned by Type", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 40 + }, + "id": 104, + "panels": [], + "title": "HTTP/REST Endpoint Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 41 + }, + "id": 40, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(iceberg_http_requests_total[$__rate_interval]))", + "legendFormat": "Requests/sec", + "refId": "A" + } + ], + "title": "Request Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.01 + }, + { + "color": "red", + "value": 0.05 + } + ] + }, + "unit": "percentunit" + 
}, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 41 + }, + "id": 41, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(iceberg_http_responses_total{status_class=~\"4xx|5xx\"}[$__rate_interval])) / sum(rate(iceberg_http_responses_total[$__rate_interval])) or vector(0)", + "legendFormat": "Error Rate", + "refId": "A" + } + ], + "title": "Error Rate (4xx+5xx)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 41 + }, + "id": 42, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "iceberg_http_requests_in_flight or vector(0)", + "legendFormat": "In Flight", + "refId": "A" + } + ], + "title": "Requests In Flight", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + 
"h": 4, + "w": 6, + "x": 18, + "y": 41 + }, + "id": 60, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(iceberg_http_response_size_bytes_total)", + "legendFormat": "Total Bytes", + "refId": "A" + } + ], + "title": "Response Size", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 45 + }, + "id": 44, + "options": { + "legend": { + "calcs": [ + "sum" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(iceberg_http_requests_total[$__rate_interval]))", + "legendFormat": "Requests/sec", + "refId": "A" + } + ], + "title": 
"Request Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.1 + }, + { + "color": "red", + "value": 0.5 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 45 + }, + "id": 43, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(iceberg_http_request_duration_seconds_bucket[$__rate_interval])))", + "legendFormat": "p95 Latency", + "refId": "A" + } + ], + "title": "p95 Latency", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": 
"reqps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*4xx.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".*5xx.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 49 + }, + "id": 45, + "options": { + "legend": { + "calcs": [ + "sum" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (status_class) (rate(iceberg_http_responses_total[$__rate_interval]))", + "legendFormat": "{{status_class}}", + "refId": "A" + } + ], + "title": "Response Rate by Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 53 + }, + "id": 46, + "options": { + "legend": { + 
"calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.50, sum by (le) (rate(iceberg_http_request_duration_seconds_bucket[$__rate_interval])))", + "legendFormat": "p50", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(iceberg_http_request_duration_seconds_bucket[$__rate_interval])))", + "legendFormat": "p95", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(iceberg_http_request_duration_seconds_bucket[$__rate_interval])))", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "Request Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 57 + }, + "id": 47, + "options": { + 
"legend": { + "calcs": [ + "sum" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (route, status_class) (rate(iceberg_http_responses_total{status_class=~\"4xx|5xx\"}[$__rate_interval])) > 0", + "legendFormat": "{{route}} ({{status_class}})", + "refId": "A" + } + ], + "title": "Errors by Route", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 65 + }, + "id": 200, + "panels": [], + "title": "InsertWatch (S3 Watch) Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 5, + "x": 0, + "y": 66 + }, + "id": 202, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(increase(ice_watch_messages_received_total{table=~\"$watch_table\", queue=~\"$watch_queue\", instance=~\"$watch_instance\"}[$__range])) or vector(0)", + "legendFormat": "Messages", + "refId": "A" + } + ], + "title": "Messages Received", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 
null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 5, + "x": 5, + "y": 66 + }, + "id": 203, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(increase(ice_watch_files_inserted_total{table=~\"$watch_table\", queue=~\"$watch_queue\", instance=~\"$watch_instance\"}[$__range])) or vector(0)", + "legendFormat": "Files", + "refId": "A" + } + ], + "title": "Files Inserted", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 5, + "x": 10, + "y": 66 + }, + "id": 204, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(increase(ice_watch_transactions_total{table=~\"$watch_table\", queue=~\"$watch_queue\", instance=~\"$watch_instance\"}[$__range])) or vector(0)", + "legendFormat": "Transactions", + "refId": "A" + } + ], + "title": "Total Transactions", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + 
"steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 5, + "x": 15, + "y": 66 + }, + "id": 205, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(increase(ice_watch_transactions_failed_total{table=~\"$watch_table\", queue=~\"$watch_queue\", instance=~\"$watch_instance\"}[$__range])) or vector(0)", + "legendFormat": "Failed", + "refId": "A" + } + ], + "title": "Failed Transactions", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 66 + }, + "id": 206, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(increase(ice_watch_retry_attempts_total{table=~\"$watch_table\", queue=~\"$watch_queue\", instance=~\"$watch_instance\"}[$__range])) or vector(0)", + "legendFormat": "Retries", + "refId": "A" + } + ], + "title": "Retry Attempts", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 0, + "y": 70 + }, + "id": 406, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(increase(ice_watch_poll_requests_total{table=~\"$watch_table\", queue=~\"$watch_queue\"}[$__range])) or vector(0)", + "legendFormat": "Poll Requests", + "refId": "A" + } + ], + "title": "Poll Requests", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 74 + }, + "id": 210, + "options": { + "legend": { + "calcs": [ + "sum" + ], + "displayMode": "table", + 
"placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (table, queue) (rate(ice_watch_events_matched_total{table=~\"$watch_table\", queue=~\"$watch_queue\", instance=~\"$watch_instance\"}[$__rate_interval]))", + "legendFormat": "Matched - {{table}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (table, queue) (rate(ice_watch_events_not_matched_total{table=~\"$watch_table\", queue=~\"$watch_queue\", instance=~\"$watch_instance\"}[$__rate_interval]))", + "legendFormat": "Not Matched - {{table}}", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (table, queue) (rate(ice_watch_events_skipped_total{table=~\"$watch_table\", queue=~\"$watch_queue\", instance=~\"$watch_instance\"}[$__rate_interval]))", + "legendFormat": "Skipped - {{table}}", + "refId": "C" + } + ], + "title": "Events by Match Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null 
+ } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 70 + }, + "id": 211, + "options": { + "legend": { + "calcs": [ + "sum" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum by (table, queue) (rate(ice_watch_files_inserted_total{table=~\"$watch_table\", queue=~\"$watch_queue\", instance=~\"$watch_instance\"}[$__rate_interval]))", + "legendFormat": "{{table}}", + "range": true, + "refId": "A" + } + ], + "title": "Files Inserted Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*Failed.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 78 + }, + "id": 212, + "options": { + "legend": { + "calcs": [ + "sum" + ], + 
"displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (table, queue) (rate(ice_watch_transactions_total{table=~\"$watch_table\", queue=~\"$watch_queue\", instance=~\"$watch_instance\"}[$__rate_interval]))", + "legendFormat": "Success - {{table}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (table, queue) (rate(ice_watch_transactions_failed_total{table=~\"$watch_table\", queue=~\"$watch_queue\", instance=~\"$watch_instance\"}[$__rate_interval]))", + "legendFormat": "Failed - {{table}}", + "refId": "B" + } + ], + "title": "Transaction Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*Queue Receive.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": 
".*Queue Delete.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".*Parse.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 78 + }, + "id": 213, + "options": { + "legend": { + "calcs": [ + "sum" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (table, queue) (rate(ice_watch_queue_receive_errors_total{table=~\"$watch_table\", queue=~\"$watch_queue\", instance=~\"$watch_instance\"}[$__rate_interval]))", + "legendFormat": "Queue Receive - {{table}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (table, queue) (rate(ice_watch_queue_delete_errors_total{table=~\"$watch_table\", queue=~\"$watch_queue\", instance=~\"$watch_instance\"}[$__rate_interval]))", + "legendFormat": "Queue Delete - {{table}}", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (table, queue) (rate(ice_watch_message_parse_errors_total{table=~\"$watch_table\", queue=~\"$watch_queue\", instance=~\"$watch_instance\"}[$__rate_interval]))", + "legendFormat": "Parse Errors - {{table}}", + "refId": "C" + } + ], + "title": "Errors Rate", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 86 + }, + "id": 300, + "panels": [], + "title": "Maintenance", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": 
"absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 87 + }, + "id": 302, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(increase(ice_maintenance_runs_total{status=\"success\"}[$__range])) or vector(0)", + "legendFormat": "Success", + "refId": "A" + } + ], + "title": "Successful Runs", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 87 + }, + "id": 303, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(increase(ice_maintenance_runs_total{status=\"failure\"}[$__range])) or vector(0)", + "legendFormat": "Failed", + "refId": "A" + } + ], + "title": "Failed Runs", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + 
"color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 87 + }, + "id": 304, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(increase(ice_maintenance_orphan_files_deleted_total[$__range])) or vector(0)", + "legendFormat": "Deleted", + "refId": "A" + } + ], + "title": "Orphan Files Deleted", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 87 + }, + "id": 305, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(increase(ice_maintenance_compaction_files_merged_total[$__range])) or vector(0)", + "legendFormat": "Merged", + "refId": "A" + } + ], + "title": "Files Compacted", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "dateTimeFromNow" + }, + 
"overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 87 + }, + "id": 306, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "max(ice_maintenance_last_run_timestamp) * 1000", + "legendFormat": "Last Run", + "range": true, + "refId": "A" + } + ], + "title": "Last Maintenance Run", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Shows how long maintenance has been running. Use this for alerting if maintenance is stuck (e.g., alert if > 1 hour).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "color": "green", + "index": 0, + "text": "Not Running" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 300 + }, + { + "color": "orange", + "value": 1800 + }, + { + "color": "red", + "value": 3600 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 91 + }, + "id": 307, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "(time() - ice_maintenance_start_timestamp)", + "legendFormat": "Duration", + "range": true, + "refId": "A" + } + ], + "title": "Current 
Maintenance Duration", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*failure.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 91 + }, + "id": 311, + "options": { + "legend": { + "calcs": [ + "sum" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (status) (rate(ice_maintenance_runs_total[$__rate_interval]))", + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "title": "Maintenance Run Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + 
"axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 95 + }, + "id": 310, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.50, sum by (le) (rate(ice_maintenance_duration_seconds_bucket[$__rate_interval])))", + "legendFormat": "p50", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(ice_maintenance_duration_seconds_bucket[$__rate_interval])))", + "legendFormat": "p95", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(ice_maintenance_duration_seconds_bucket[$__rate_interval])))", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "Maintenance Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + 
"axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*Deleted.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".*Failed.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 103 + }, + "id": 312, + "options": { + "legend": { + "calcs": [ + "sum" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (table) (rate(ice_maintenance_orphan_files_found_total[$__rate_interval]))", + "legendFormat": "Found - {{table}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (table) (rate(ice_maintenance_orphan_files_deleted_total[$__rate_interval]))", + "legendFormat": "Deleted - {{table}}", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (table) (rate(ice_maintenance_orphan_delete_failures_total[$__rate_interval]))", + "legendFormat": "Failed - {{table}}", + "refId": "C" + } + 
], + "title": "Orphan Cleanup Rate", + "type": "timeseries" + } + ], + "refresh": "", + "schemaVersion": 38, + "tags": [ + "iceberg", + "data-lake", + "rest-catalog", + "s3-watch" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "PBFA97CFB590B2093" + }, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values({__name__=~\"iceberg_.+\"}, catalog)", + "hide": 0, + "includeAll": true, + "label": "Catalog", + "multi": true, + "name": "catalog", + "options": [], + "query": { + "query": "label_values({__name__=~\"iceberg_.+\"}, catalog)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values({__name__=~\"iceberg_.+\", catalog=~\"$catalog\"}, namespace)", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": true, + "name": "namespace", + "options": [], + "query": { + "query": "label_values({__name__=~\"iceberg_.+\", catalog=~\"$catalog\"}, namespace)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values({__name__=~\"iceberg_.+\", 
catalog=~\"$catalog\", namespace=~\"$namespace\"}, table)", + "hide": 0, + "includeAll": true, + "label": "Table", + "multi": true, + "name": "table", + "options": [], + "query": { + "query": "label_values({__name__=~\"iceberg_.+\", catalog=~\"$catalog\", namespace=~\"$namespace\"}, table)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values({__name__=~\"ice_watch_.+\"}, table)", + "hide": 0, + "includeAll": true, + "label": "Watch Table", + "multi": true, + "name": "watch_table", + "options": [], + "query": { + "query": "label_values({__name__=~\"ice_watch_.+\"}, table)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "http://localhost:9324/000000000000/s3-events" + ], + "value": [ + "http://localhost:9324/000000000000/s3-events" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values({__name__=~\"ice_watch_.+\", table=~\"$watch_table\"}, queue)", + "hide": 0, + "includeAll": true, + "label": "Watch Queue", + "multi": true, + "name": "watch_queue", + "options": [], + "query": { + "query": "label_values({__name__=~\"ice_watch_.+\", table=~\"$watch_table\"}, queue)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values({__name__=~\"ice_watch_.+\"}, instance)", + "hide": 0, 
+ "includeAll": true, + "label": "Watch Instance", + "multi": true, + "name": "watch_instance", + "options": [], + "query": { + "query": "label_values({__name__=~\"ice_watch_.+\"}, instance)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Iceberg REST Catalog Metrics", + "uid": "iceberg-rest-catalog-metrics", + "version": 41, + "weekStart": "" +} diff --git a/examples/grafana/screenshots/commit-delete-metrics.png b/examples/grafana/screenshots/commit-delete-metrics.png new file mode 100644 index 0000000..622db13 Binary files /dev/null and b/examples/grafana/screenshots/commit-delete-metrics.png differ diff --git a/examples/grafana/screenshots/http-metrics.png b/examples/grafana/screenshots/http-metrics.png new file mode 100644 index 0000000..fb6b167 Binary files /dev/null and b/examples/grafana/screenshots/http-metrics.png differ diff --git a/examples/grafana/screenshots/insertwatch-metrics.png b/examples/grafana/screenshots/insertwatch-metrics.png new file mode 100644 index 0000000..ead7c33 Binary files /dev/null and b/examples/grafana/screenshots/insertwatch-metrics.png differ diff --git a/examples/grafana/screenshots/maintenance-metrics.png b/examples/grafana/screenshots/maintenance-metrics.png new file mode 100644 index 0000000..4141065 Binary files /dev/null and b/examples/grafana/screenshots/maintenance-metrics.png differ diff --git a/examples/grafana/screenshots/overview-scan-metrics.png b/examples/grafana/screenshots/overview-scan-metrics.png new file mode 100644 index 0000000..8cd3118 Binary files /dev/null and b/examples/grafana/screenshots/overview-scan-metrics.png differ diff --git a/examples/s3watch/test/README.md b/examples/s3watch/test/README.md new file mode 100644 index 0000000..35801bc --- /dev/null +++ b/examples/s3watch/test/README.md @@ -0,0 
+1,28 @@ +### Local testing + +The `ice insert --watch` mode can also be tested locally with an ElasticMQ server (which is SQS-compatible). + +### Start the ElasticMQ server +`docker compose up` + +### Start ice in insert mode for local ElasticMQ server +`ice insert flowers.iris -p --no-copy --skip-duplicates \ + s3://bucket1/flowers/iris/external-data/ \ + --watch="http://localhost:9324/000000000000/s3-events" \ + --watch-endpoint="http://localhost:9324" + ` + +### Insert an S3 notification message (test) into ElasticMQ +``` +export AWS_ACCESS_KEY_ID=x +export AWS_SECRET_ACCESS_KEY=x +export AWS_REGION=us-east-1 + +aws --endpoint-url http://localhost:9324 sqs send-message \ + --queue-url http://localhost:9324/000000000000/s3-events \ + --message-body '{"Records":[{"eventName":"ObjectCreated:Put","s3":{"bucket":{"name":"bucket1"},"object":{"key":"flowers/iris/external-data/iris.parquet"}}}]}' +{ + "MD5OfMessageBody": "0ca3828dbdd1604d4b22fdfcb1226996", + "MessageId": "570bfd40-c0be-49f8-8119-25b74aad0894" +} +``` diff --git a/examples/s3watch/test/docker-compose.yaml b/examples/s3watch/test/docker-compose.yaml new file mode 100644 index 0000000..3bf72c4 --- /dev/null +++ b/examples/s3watch/test/docker-compose.yaml @@ -0,0 +1,9 @@ +services: + elasticmq: + image: softwaremill/elasticmq-native:1.6.15 + restart: unless-stopped + ports: + - '9324:9324' # SQS API + - '9325:9325' # Web UI + volumes: + - ./elasticmq.conf:/opt/elasticmq.conf:ro diff --git a/examples/s3watch/test/elasticmq.conf b/examples/s3watch/test/elasticmq.conf new file mode 100644 index 0000000..afee5de --- /dev/null +++ b/examples/s3watch/test/elasticmq.conf @@ -0,0 +1,44 @@ +include classpath("application.conf") + +node-address { + protocol = http + host = "*" + port = 9324 + context-path = "" +} + +rest-sqs { + enabled = true + bind-port = 9324 + bind-hostname = "0.0.0.0" + sqs-limits = strict +} + +rest-stats { + enabled = true + bind-port = 9325 + bind-hostname = "0.0.0.0" +} + +queues { + s3-events { +
defaultVisibilityTimeout = 30 seconds + delay = 0 seconds + receiveMessageWait = 0 seconds + deadLettersQueue { + name = "s3-events-dlq" + maxReceiveCount = 3 + } + } + s3-events-dlq { + defaultVisibilityTimeout = 30 seconds + delay = 0 seconds + receiveMessageWait = 0 seconds + } +} + +aws { + region = elasticmq + accountId = 000000000000 +} + diff --git a/examples/scratch/README.md b/examples/scratch/README.md index e2e50bb..339b984 100644 --- a/examples/scratch/README.md +++ b/examples/scratch/README.md @@ -34,7 +34,11 @@ ice insert nyc.taxis -p \ ice insert nyc.taxis_p_by_day -p \ https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-01.parquet \ --partition='[{"column":"tpep_pickup_datetime","transform":"day"}]' - + +# delete partition +ice delete nyc.taxis_p_by_day \ + --partition '[{"name": "tpep_pickup_datetime", "values": ["2024-12-31T23:51:20"]}]' --dry-run=false + # insert data ordered by tpep_pickup_datetime column ice insert nyc.taxis_s_by_day -p \ https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-01.parquet \ diff --git a/ice-rest-catalog/README.md b/ice-rest-catalog/README.md index ffca398..de2a9d7 100644 --- a/ice-rest-catalog/README.md +++ b/ice-rest-catalog/README.md @@ -9,4 +9,4 @@ create `.ice-rest-catalog.yaml` (schema defined [here](src/main/java/com/altinit and then execute `ice-rest-catalog`. That's it. -Examples of `.ice-rest-catalog.yaml` (as well as Kubernetes deployment manifests) can be found [here](../examples/). +Examples of `.ice-rest-catalog.yaml` (as well as Kubernetes deployment manifests) can be found [here](../examples/). 
diff --git a/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/Main.java b/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/Main.java index c842705..53a571c 100644 --- a/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/Main.java +++ b/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/Main.java @@ -27,6 +27,8 @@ import com.altinity.ice.rest.catalog.internal.maintenance.ManifestCompaction; import com.altinity.ice.rest.catalog.internal.maintenance.OrphanCleanup; import com.altinity.ice.rest.catalog.internal.maintenance.SnapshotCleanup; +import com.altinity.ice.rest.catalog.internal.metrics.CatalogMetrics; +import com.altinity.ice.rest.catalog.internal.metrics.PrometheusMetricsReporter; import com.altinity.ice.rest.catalog.internal.rest.RESTCatalogAdapter; import com.altinity.ice.rest.catalog.internal.rest.RESTCatalogAuthorizationHandler; import com.altinity.ice.rest.catalog.internal.rest.RESTCatalogHandler; @@ -51,6 +53,8 @@ import org.apache.iceberg.CatalogUtil; import org.apache.iceberg.aws.s3.S3FileIOProperties; import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.SupportsNamespaces; import org.apache.iceberg.relocated.com.google.common.base.Function; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.eclipse.jetty.server.Server; @@ -201,9 +205,38 @@ void performMaintenance( } } + private static void initializeCatalogMetrics(Catalog catalog) { + try { + CatalogMetrics metrics = CatalogMetrics.getInstance(); + String catalogName = catalog.name(); + + // List namespaces once so both gauges are seeded from the same listing + if (catalog instanceof SupportsNamespaces nsCatalog) { + var namespaces = nsCatalog.listNamespaces(); + metrics.setNamespacesTotal(catalogName, namespaces.size()); + logger.info("Initialized namespace count: {}", namespaces.size()); + + // Count tables across the namespaces listed above + long tableCount = 0;
+ for (Namespace ns : namespaces) { + tableCount += catalog.listTables(ns).size(); + } + metrics.setTablesTotal(catalogName, tableCount); + logger.info("Initialized table count: {}", tableCount); + } + } catch (Exception e) { + logger.warn("Failed to initialize catalog metrics: {}", e.getMessage(), e); + } + } + private static Server createServer( - String host, int port, Catalog catalog, Config config, Map icebergConfig) { - var s = createBaseServer(catalog, config, icebergConfig, true); + String host, + int port, + Catalog catalog, + Config config, + Map icebergConfig, + PrometheusMetricsReporter metricsReporter) { + var s = createBaseServer(catalog, config, icebergConfig, true, metricsReporter); ServerConnector connector = new ServerConnector(s); connector.setHost(host); connector.setPort(port); @@ -212,8 +245,13 @@ private static Server createServer( } private static Server createAdminServer( - String host, int port, Catalog catalog, Config config, Map icebergConfig) { - var s = createBaseServer(catalog, config, icebergConfig, false); + String host, + int port, + Catalog catalog, + Config config, + Map icebergConfig, + PrometheusMetricsReporter metricsReporter) { + var s = createBaseServer(catalog, config, icebergConfig, false, metricsReporter); ServerConnector connector = new ServerConnector(s); connector.setHost(host); connector.setPort(port); @@ -222,7 +260,11 @@ private static Server createAdminServer( } private static Server createBaseServer( - Catalog catalog, Config config, Map icebergConfig, boolean requireAuth) { + Catalog catalog, + Config config, + Map icebergConfig, + boolean requireAuth, + PrometheusMetricsReporter metricsReporter) { var mux = new ServletContextHandler(ServletContextHandler.NO_SESSIONS); mux.insertHandler(new GzipHandler()); // TODO: RequestLogHandler @@ -372,6 +414,9 @@ public Integer call() throws Exception { var catalog = loadCatalog(config, icebergConfig); + // Initialize catalog metrics with current counts +
initializeCatalogMetrics(catalog); + ObjectMapper om = new ObjectMapper(); for (Config.Token t : config.bearerTokens()) { @@ -401,6 +446,9 @@ public Integer call() throws Exception { logger.info("Catalog maintenance disabled (no maintenance schedule specified)"); } + // Initialize Iceberg metrics reporter for Prometheus (singleton) + PrometheusMetricsReporter metricsReporter = PrometheusMetricsReporter.getInstance(); + // TODO: ensure all http handlers are hooked in JvmMetrics.builder().register(); @@ -414,14 +462,21 @@ public Integer call() throws Exception { adminHostAndPort.getPort(), catalog, config, - icebergConfig); + icebergConfig, + metricsReporter); adminServer.start(); logger.warn("Serving admin endpoint at http://{}/v1/{config,*}", adminHostAndPort); } HostAndPort hostAndPort = HostAndPort.fromString(config.addr()); Server httpServer = - createServer(hostAndPort.getHost(), hostAndPort.getPort(), catalog, config, icebergConfig); + createServer( + hostAndPort.getHost(), + hostAndPort.getPort(), + catalog, + config, + icebergConfig, + metricsReporter); httpServer.start(); logger.info("Serving http://{}/v1/{config,*}", hostAndPort); diff --git a/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/maintenance/DataCompaction.java b/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/maintenance/DataCompaction.java index 55e23fe..df7c4c4 100644 --- a/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/maintenance/DataCompaction.java +++ b/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/maintenance/DataCompaction.java @@ -13,6 +13,7 @@ import com.altinity.ice.cli.internal.iceberg.RecordComparator; import com.altinity.ice.internal.iceberg.io.SchemeFileIO; import com.altinity.ice.internal.strings.Strings; +import com.altinity.ice.rest.catalog.internal.metrics.MaintenanceMetrics; import java.io.IOException; import java.io.UncheckedIOException; import java.util.ArrayList; @@ -146,6 +147,8 @@ 
private void merge( FileIO tableIO = table.io(); Schema tableSchema = table.schema(); PartitionSpec tableSpec = table.spec(); + String tableName = table.name(); + MaintenanceMetrics maintenanceMetrics = MaintenanceMetrics.getInstance(); Transaction tx = table.newTransaction(); @@ -163,6 +166,10 @@ private void merge( return; } + // Calculate total bytes to read + long totalBytesToRead = dataFiles.stream().mapToLong(DataFile::fileSizeInBytes).sum(); + maintenanceMetrics.recordCompactionBytesRead(tableName, totalBytesToRead); + OutputFile outputFile = tableIO.newOutputFile(Strings.replacePrefix(dstDataFile, "s3://", "s3a://")); @@ -252,5 +259,10 @@ private void merge( delOp.commit(); tx.commitTransaction(); + + // Record metrics after successful commit + maintenanceMetrics.recordCompactionFilesMerged(tableName, dataFiles.size()); + maintenanceMetrics.recordCompactionOutputFile(tableName); + maintenanceMetrics.recordCompactionBytesWritten(tableName, dataFileSizeInBytes); } } diff --git a/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/maintenance/MaintenanceScheduler.java b/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/maintenance/MaintenanceScheduler.java index 2840415..107c772 100644 --- a/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/maintenance/MaintenanceScheduler.java +++ b/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/maintenance/MaintenanceScheduler.java @@ -9,6 +9,7 @@ */ package com.altinity.ice.rest.catalog.internal.maintenance; +import com.altinity.ice.rest.catalog.internal.metrics.MaintenanceMetrics; import com.github.shyiko.skedule.Schedule; import java.time.ZonedDateTime; import java.util.concurrent.ScheduledExecutorService; @@ -74,22 +75,35 @@ private void scheduleNextMaintenance() { } + /** Runs one maintenance pass unless maintenance mode is already active, recording run metrics. */ public void performMaintenance() { - if (isMaintenanceMode.get()) { - logger.info("Skipping maintenance task as system is already in maintenance mode"); - return; - }
+ MaintenanceMetrics metrics = MaintenanceMetrics.getInstance(); + + // Check before entering try/finally: the finally block clears maintenance mode and records + // a completed run, and neither may happen for an invocation that was skipped. + if (isMaintenanceMode.get()) { + logger.info("Skipping maintenance task as system is already in maintenance mode"); + metrics.recordMaintenanceSkipped(); + return; + } + + long startTime = System.nanoTime(); + boolean success = false; try { logger.info("Starting scheduled maintenance task"); setMaintenanceMode(true); + metrics.recordMaintenanceStarted(); maintenanceRunner.run(); logger.info("Scheduled maintenance task completed successfully"); + success = true; } catch (Exception e) { logger.error("Error during scheduled maintenance task", e); } finally { setMaintenanceMode(false); + double durationSecs = (System.nanoTime() - startTime) / 1_000_000_000.0; + metrics.recordMaintenanceCompleted(success, durationSecs); } } diff --git a/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/maintenance/OrphanCleanup.java b/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/maintenance/OrphanCleanup.java index 94c28ab..ab2389d 100644 --- a/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/maintenance/OrphanCleanup.java +++ b/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/maintenance/OrphanCleanup.java @@ -11,6 +11,7 @@ import com.altinity.ice.internal.iceberg.io.SchemeFileIO; import com.altinity.ice.internal.io.Matcher; +import com.altinity.ice.rest.catalog.internal.metrics.MaintenanceMetrics; import java.io.IOException; import java.io.UncheckedIOException; import java.util.ArrayDeque; @@ -21,6 +22,7 @@ import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicInteger; import org.apache.iceberg.BaseTable; import org.apache.iceberg.DataFile; import org.apache.iceberg.ManifestFile; @@ -47,6 +49,8 @@ public record OrphanCleanup(long olderThanMillis, Matcher whitelist, boolean dry @Override public void
perform(Table table) throws IOException { String location = table.location(); + String tableName = table.name(); + MaintenanceMetrics metrics = MaintenanceMetrics.getInstance(); logger.info("Searching for orphaned files at {}", location); @@ -64,6 +68,10 @@ public void perform(Table table) throws IOException { logger.info("Found {} orphaned file(s) ({} excluded)", orphanedFiles.size(), excluded); + // Record metrics + metrics.recordOrphanFilesFound(tableName, orphanedFiles.size() + excluded); + metrics.recordOrphanFilesExcluded(tableName, excluded); + if (orphanedFiles.isEmpty()) { return; } @@ -71,6 +79,9 @@ public void perform(Table table) throws IOException { if (!dryRun) { FileIO tableIO = table.io(); + AtomicInteger deletedCount = new AtomicInteger(0); + AtomicInteger failedCount = new AtomicInteger(0); + int numThreads = Math.min(8, orphanedFiles.size()); try (ExecutorService executor = Executors.newFixedThreadPool(numThreads)) { orphanedFiles.forEach( @@ -80,13 +91,21 @@ public void perform(Table table) throws IOException { try { logger.info("Deleting {}", file); tableIO.deleteFile(file); + deletedCount.incrementAndGet(); return file; } catch (Exception e) { logger.warn("Failed to delete file {}", file, e); + failedCount.incrementAndGet(); return null; } })); } + + // Record deletion metrics + metrics.recordOrphanFilesDeleted(tableName, deletedCount.get()); + if (failedCount.get() > 0) { + metrics.recordOrphanDeleteFailure(tableName, failedCount.get()); + } } else { orphanedFiles.stream().sorted().forEach(file -> logger.info("To be deleted: {}", file)); } diff --git a/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/metrics/CatalogMetrics.java b/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/metrics/CatalogMetrics.java new file mode 100644 index 0000000..ce7a4d9 --- /dev/null +++ b/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/metrics/CatalogMetrics.java @@ -0,0 +1,144 @@ +/* + * Copyright 
(c) 2025 Altinity Inc and/or its affiliates. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ +package com.altinity.ice.rest.catalog.internal.metrics; + +import static com.altinity.ice.rest.catalog.internal.metrics.IcebergMetricNames.CATALOG_NAMESPACES_HELP; +import static com.altinity.ice.rest.catalog.internal.metrics.IcebergMetricNames.CATALOG_NAMESPACES_NAME; +import static com.altinity.ice.rest.catalog.internal.metrics.IcebergMetricNames.CATALOG_OPERATIONS_TOTAL_HELP; +import static com.altinity.ice.rest.catalog.internal.metrics.IcebergMetricNames.CATALOG_OPERATIONS_TOTAL_NAME; +import static com.altinity.ice.rest.catalog.internal.metrics.IcebergMetricNames.CATALOG_OPERATION_LABELS; +import static com.altinity.ice.rest.catalog.internal.metrics.IcebergMetricNames.CATALOG_TABLES_HELP; +import static com.altinity.ice.rest.catalog.internal.metrics.IcebergMetricNames.CATALOG_TABLES_NAME; +import static com.altinity.ice.rest.catalog.internal.metrics.IcebergMetricNames.LABEL_CATALOG; + +import io.prometheus.metrics.core.metrics.Counter; +import io.prometheus.metrics.core.metrics.Gauge; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Prometheus metrics for catalog-level statistics. + * + * <p>Tracks: + * + * <ul> + * <li>Total number of tables (gauge) + * <li>Total number of namespaces (gauge) + * <li>Catalog operations counter (create/drop table/namespace) + * </ul>
+ */ +public class CatalogMetrics { + + private static final Logger logger = LoggerFactory.getLogger(CatalogMetrics.class); + + // Initialization-on-Demand Holder for thread-safe lazy singleton + private static class Holder { + private static final CatalogMetrics INSTANCE = new CatalogMetrics(); + } + + // Operation types for the operations counter + public static final String OP_CREATE_TABLE = "create_table"; + public static final String OP_DROP_TABLE = "drop_table"; + public static final String OP_CREATE_NAMESPACE = "create_namespace"; + public static final String OP_DROP_NAMESPACE = "drop_namespace"; + + private final Gauge tablesTotal; + private final Gauge namespacesTotal; + private final Counter operationsTotal; + + /** Returns the singleton instance of the catalog metrics. */ + public static CatalogMetrics getInstance() { + return Holder.INSTANCE; + } + + private CatalogMetrics() { + this.tablesTotal = + Gauge.builder() + .name(CATALOG_TABLES_NAME) + .help(CATALOG_TABLES_HELP) + .labelNames(LABEL_CATALOG) + .register(); + + this.namespacesTotal = + Gauge.builder() + .name(CATALOG_NAMESPACES_NAME) + .help(CATALOG_NAMESPACES_HELP) + .labelNames(LABEL_CATALOG) + .register(); + + this.operationsTotal = + Counter.builder() + .name(CATALOG_OPERATIONS_TOTAL_NAME) + .help(CATALOG_OPERATIONS_TOTAL_HELP) + .labelNames(CATALOG_OPERATION_LABELS) + .register(); + + logger.info("Catalog Prometheus metrics initialized"); + } + + /** Set the total number of tables in the catalog. */ + public void setTablesTotal(String catalog, long count) { + tablesTotal.labelValues(catalog).set(count); + } + + /** Increment the total number of tables in the catalog. */ + public void incrementTablesTotal(String catalog) { + tablesTotal.labelValues(catalog).inc(); + } + + /** Decrement the total number of tables in the catalog. */ + public void decrementTablesTotal(String catalog) { + tablesTotal.labelValues(catalog).dec(); + } + + /** Set the total number of namespaces in the catalog.
*/ + public void setNamespacesTotal(String catalog, long count) { + namespacesTotal.labelValues(catalog).set(count); + } + + /** Increment the total number of namespaces in the catalog. */ + public void incrementNamespacesTotal(String catalog) { + namespacesTotal.labelValues(catalog).inc(); + } + + /** Decrement the total number of namespaces in the catalog. */ + public void decrementNamespacesTotal(String catalog) { + namespacesTotal.labelValues(catalog).dec(); + } + + /** Record a catalog operation. */ + public void recordOperation(String catalog, String operation) { + operationsTotal.labelValues(catalog, operation).inc(); + } + + /** Record a table creation. */ + public void recordTableCreated(String catalog) { + incrementTablesTotal(catalog); + recordOperation(catalog, OP_CREATE_TABLE); + } + + /** Record a table drop. */ + public void recordTableDropped(String catalog) { + decrementTablesTotal(catalog); + recordOperation(catalog, OP_DROP_TABLE); + } + + /** Record a namespace creation. */ + public void recordNamespaceCreated(String catalog) { + incrementNamespacesTotal(catalog); + recordOperation(catalog, OP_CREATE_NAMESPACE); + } + + /** Record a namespace drop. */ + public void recordNamespaceDropped(String catalog) { + decrementNamespacesTotal(catalog); + recordOperation(catalog, OP_DROP_NAMESPACE); + } +} diff --git a/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/metrics/HttpMetrics.java b/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/metrics/HttpMetrics.java new file mode 100644 index 0000000..ea34aee --- /dev/null +++ b/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/metrics/HttpMetrics.java @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2025 Altinity Inc and/or its affiliates. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
/*
 * Copyright (c) 2025 Altinity Inc and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 */
package com.altinity.ice.rest.catalog.internal.metrics;

import static com.altinity.ice.rest.catalog.internal.metrics.IcebergMetricNames.HTTP_DURATION_BUCKETS;
import static com.altinity.ice.rest.catalog.internal.metrics.IcebergMetricNames.HTTP_REQUESTS_IN_FLIGHT_HELP;
import static com.altinity.ice.rest.catalog.internal.metrics.IcebergMetricNames.HTTP_REQUESTS_IN_FLIGHT_NAME;
import static com.altinity.ice.rest.catalog.internal.metrics.IcebergMetricNames.HTTP_REQUESTS_TOTAL_HELP;
import static com.altinity.ice.rest.catalog.internal.metrics.IcebergMetricNames.HTTP_REQUESTS_TOTAL_NAME;
import static com.altinity.ice.rest.catalog.internal.metrics.IcebergMetricNames.HTTP_REQUEST_DURATION_HELP;
import static com.altinity.ice.rest.catalog.internal.metrics.IcebergMetricNames.HTTP_REQUEST_DURATION_NAME;
import static com.altinity.ice.rest.catalog.internal.metrics.IcebergMetricNames.HTTP_REQUEST_LABELS;
import static com.altinity.ice.rest.catalog.internal.metrics.IcebergMetricNames.HTTP_RESPONSES_TOTAL_HELP;
import static com.altinity.ice.rest.catalog.internal.metrics.IcebergMetricNames.HTTP_RESPONSES_TOTAL_NAME;
import static com.altinity.ice.rest.catalog.internal.metrics.IcebergMetricNames.HTTP_RESPONSE_LABELS;
import static com.altinity.ice.rest.catalog.internal.metrics.IcebergMetricNames.HTTP_RESPONSE_SIZE_BYTES_HELP;
import static com.altinity.ice.rest.catalog.internal.metrics.IcebergMetricNames.HTTP_RESPONSE_SIZE_BYTES_NAME;

import io.prometheus.metrics.core.metrics.Counter;
import io.prometheus.metrics.core.metrics.Gauge;
import io.prometheus.metrics.core.metrics.Histogram;
import java.util.concurrent.TimeUnit;

/**
 * Singleton holder for the HTTP/REST endpoint Prometheus metrics: request and response counters, a
 * request-duration histogram, an in-flight gauge and a response-size counter.
 *
 * <p>Request-scoped metrics are labelled by method and route; the response counter additionally
 * carries a {@code status_class} label (see {@link #recordRequestEnd}).
 */
public class HttpMetrics {

  /** Lazy holder: the instance is built on the first call to {@link #getInstance()}. */
  private static class Holder {
    private static final HttpMetrics INSTANCE = new HttpMetrics();
  }

  private final Counter requestsTotal;
  private final Counter responsesTotal;
  private final Histogram requestDuration;
  private final Gauge requestsInFlight;
  private final Counter responseSizeBytes;

  /**
   * Returns the shared instance.
   *
   * @return the singleton {@code HttpMetrics}
   */
  public static HttpMetrics getInstance() {
    return Holder.INSTANCE;
  }

  private HttpMetrics() {
    this.requestsTotal =
        Counter.builder()
            .name(HTTP_REQUESTS_TOTAL_NAME)
            .help(HTTP_REQUESTS_TOTAL_HELP)
            .labelNames(HTTP_REQUEST_LABELS)
            .register();

    this.responsesTotal =
        Counter.builder()
            .name(HTTP_RESPONSES_TOTAL_NAME)
            .help(HTTP_RESPONSES_TOTAL_HELP)
            .labelNames(HTTP_RESPONSE_LABELS)
            .register();

    this.requestDuration =
        Histogram.builder()
            .name(HTTP_REQUEST_DURATION_NAME)
            .help(HTTP_REQUEST_DURATION_HELP)
            .labelNames(HTTP_REQUEST_LABELS)
            .classicUpperBounds(HTTP_DURATION_BUCKETS)
            .register();

    this.requestsInFlight =
        Gauge.builder()
            .name(HTTP_REQUESTS_IN_FLIGHT_NAME)
            .help(HTTP_REQUESTS_IN_FLIGHT_HELP)
            .register();

    this.responseSizeBytes =
        Counter.builder()
            .name(HTTP_RESPONSE_SIZE_BYTES_NAME)
            .help(HTTP_RESPONSE_SIZE_BYTES_HELP)
            .labelNames(HTTP_REQUEST_LABELS)
            .register();

    // Initialize with zero to make metrics visible immediately
    this.responseSizeBytes.labelValues("GET", "CONFIG").inc(0);
  }

  /**
   * Counts a new request and raises the in-flight gauge.
   *
   * @param method HTTP method label value
   * @param route route label value
   */
  public void recordRequestStart(String method, String route) {
    requestsTotal.labelValues(method, route).inc();
    requestsInFlight.inc();
  }

  /**
   * Records the completion of a request: lowers the in-flight gauge, observes the elapsed time,
   * counts the response under its status class, and adds the response size when it is positive.
   *
   * <p>The {@code status_class} label receives the class of the status code ({@code "2xx"},
   * {@code "4xx"}, ...) rather than the raw code, matching the label name and the metric's help
   * text and keeping the number of series per route bounded.
   *
   * @param method HTTP method label value
   * @param route route label value
   * @param statusCode HTTP status code of the response
   * @param startTimeNanos {@link System#nanoTime()} reading taken when the request started
   * @param responseSize response body size in bytes; values {@code <= 0} are not recorded
   */
  public void recordRequestEnd(
      String method, String route, int statusCode, long startTimeNanos, long responseSize) {
    requestsInFlight.dec();

    double durationSeconds =
        (System.nanoTime() - startTimeNanos) / (double) TimeUnit.SECONDS.toNanos(1);
    requestDuration.labelValues(method, route).observe(durationSeconds);
    responsesTotal.labelValues(method, route, statusClass(statusCode)).inc();
    if (responseSize > 0) {
      responseSizeBytes.labelValues(method, route).inc(responseSize);
    }
  }

  /**
   * Maps an HTTP status code to its class label.
   *
   * @param statusCode HTTP status code
   * @return {@code "1xx"} .. {@code "5xx"} for codes in 100-599, otherwise {@code "unknown"}
   */
  private static String statusClass(int statusCode) {
    if (statusCode < 100 || statusCode > 599) {
      return "unknown";
    }
    return (statusCode / 100) + "xx";
  }

  /**
   * Starts timing a request; {@link #recordRequestStart} is invoked immediately and {@link
   * #recordRequestEnd} when the returned timer is closed.
   *
   * @param method HTTP method label value
   * @param route route label value
   * @return a timer intended for use in a try-with-resources statement
   */
  public RequestTimer startRequest(String method, String route) {
    return new RequestTimer(this, method, route);
  }

  /**
   * Try-with-resources helper that brackets one request. The status code defaults to 200 and the
   * response size to 0 unless set before {@link #close()}.
   */
  public static class RequestTimer implements AutoCloseable {
    private final HttpMetrics metrics;
    private final String method;
    private final String route;
    private final long startTimeNanos;
    private int statusCode = 200;
    private long responseSize = 0;
    // Guards against a second close() decrementing the in-flight gauge twice.
    private boolean closed;

    RequestTimer(HttpMetrics metrics, String method, String route) {
      this.metrics = metrics;
      this.method = method;
      this.route = route;
      this.startTimeNanos = System.nanoTime();
      metrics.recordRequestStart(method, route);
    }

    /** Set the status code before closing. Default is 200. */
    public void setStatusCode(int statusCode) {
      this.statusCode = statusCode;
    }

    /** Set the response size in bytes before closing. */
    public void setResponseSize(long responseSize) {
      this.responseSize = responseSize;
    }

    /** Records the end of the request. Calls after the first one have no effect. */
    @Override
    public void close() {
      if (closed) {
        return;
      }
      closed = true;
      metrics.recordRequestEnd(method, route, statusCode, startTimeNanos, responseSize);
    }
  }
}
/**
 * Central registry of the Prometheus metric names, help texts, label names, label sets and
 * histogram bucket boundaries used by the Iceberg metrics classes. Not instantiable.
 */
public final class IcebergMetricNames {

  private IcebergMetricNames() {}

  // ---------------------------------------------------------------------------------------------
  // Label names
  // ---------------------------------------------------------------------------------------------

  public static final String LABEL_CATALOG = "catalog";
  public static final String LABEL_NAMESPACE = "namespace";
  public static final String LABEL_TABLE = "table";
  public static final String LABEL_OPERATION = "operation";
  public static final String LABEL_TYPE = "type";
  public static final String LABEL_METHOD = "method";
  public static final String LABEL_ROUTE = "route";
  public static final String LABEL_STATUS_CLASS = "status_class";

  // ---------------------------------------------------------------------------------------------
  // Label sets
  // ---------------------------------------------------------------------------------------------

  /** Labels attached to scan metrics. */
  public static final String[] SCAN_LABELS =
      new String[] {LABEL_CATALOG, LABEL_NAMESPACE, LABEL_TABLE};

  /** Labels attached to commit metrics. */
  public static final String[] COMMIT_LABELS =
      new String[] {LABEL_CATALOG, LABEL_NAMESPACE, LABEL_TABLE, LABEL_OPERATION};

  /** Labels attached to per-request HTTP metrics. */
  public static final String[] HTTP_REQUEST_LABELS = new String[] {LABEL_METHOD, LABEL_ROUTE};

  /** Labels attached to the HTTP response counter. */
  public static final String[] HTTP_RESPONSE_LABELS =
      new String[] {LABEL_METHOD, LABEL_ROUTE, LABEL_STATUS_CLASS};

  /** Labels attached to the catalog operations counter. */
  public static final String[] CATALOG_OPERATION_LABELS =
      new String[] {LABEL_CATALOG, LABEL_OPERATION};

  // ---------------------------------------------------------------------------------------------
  // Histogram bucket upper bounds (seconds)
  // ---------------------------------------------------------------------------------------------

  /** Duration histogram buckets (in seconds) - suitable for typical Iceberg operations. */
  public static final double[] DURATION_BUCKETS =
      new double[] {0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60};

  /** HTTP request duration buckets (in seconds) - suitable for REST API calls. */
  public static final double[] HTTP_DURATION_BUCKETS =
      new double[] {0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10};

  // ---------------------------------------------------------------------------------------------
  // Reporter status
  // ---------------------------------------------------------------------------------------------

  public static final String REPORTER_ACTIVE_NAME = "iceberg_metrics_reporter_active";
  public static final String REPORTER_ACTIVE_HELP =
      "Iceberg metrics reporter status (value 1 means reporter is active)";

  public static final String REPORT_ERRORS_NAME = "iceberg_metrics_report_errors_total";
  public static final String REPORT_ERRORS_HELP =
      "Total number of errors while processing metrics reports";

  // ---------------------------------------------------------------------------------------------
  // Scan metrics
  // ---------------------------------------------------------------------------------------------

  public static final String SCANS_TOTAL_NAME = "iceberg_scans_total";
  public static final String SCANS_TOTAL_HELP = "Total number of Iceberg table scans";

  public static final String SCAN_RESULT_DATA_FILES_NAME = "iceberg_scan_result_data_files_total";
  public static final String SCAN_RESULT_DATA_FILES_HELP =
      "Total number of data files in scan results";

  public static final String SCAN_RESULT_DELETE_FILES_NAME =
      "iceberg_scan_result_delete_files_total";
  public static final String SCAN_RESULT_DELETE_FILES_HELP =
      "Total number of delete files in scan results";

  public static final String SCAN_INDEXED_DELETE_FILES_NAME =
      "iceberg_scan_indexed_delete_files_total";
  public static final String SCAN_INDEXED_DELETE_FILES_HELP =
      "Total number of indexed delete files in scan results";

  public static final String SCAN_POSITIONAL_DELETE_FILES_NAME =
      "iceberg_scan_positional_delete_files_total";
  public static final String SCAN_POSITIONAL_DELETE_FILES_HELP =
      "Total number of positional delete files in scan results";

  public static final String SCAN_EQUALITY_DELETE_FILES_NAME =
      "iceberg_scan_equality_delete_files_total";
  public static final String SCAN_EQUALITY_DELETE_FILES_HELP =
      "Total number of equality delete files in scan results";

  public static final String SCAN_TOTAL_DATA_MANIFESTS_NAME = "iceberg_scan_total_data_manifests";
  public static final String SCAN_TOTAL_DATA_MANIFESTS_HELP =
      "Total number of data manifests considered during scans";

  public static final String SCAN_TOTAL_DELETE_MANIFESTS_NAME =
      "iceberg_scan_total_delete_manifests";
  public static final String SCAN_TOTAL_DELETE_MANIFESTS_HELP =
      "Total number of delete manifests considered during scans";

  public static final String SCAN_SCANNED_DATA_MANIFESTS_NAME =
      "iceberg_scan_scanned_data_manifests";
  public static final String SCAN_SCANNED_DATA_MANIFESTS_HELP =
      "Total number of data manifests actually scanned";

  public static final String SCAN_SKIPPED_DATA_MANIFESTS_NAME =
      "iceberg_scan_skipped_data_manifests";
  public static final String SCAN_SKIPPED_DATA_MANIFESTS_HELP =
      "Total number of data manifests skipped during scans";

  public static final String SCAN_TOTAL_FILE_SIZE_BYTES_NAME = "iceberg_scan_total_file_size_bytes";
  public static final String SCAN_TOTAL_FILE_SIZE_BYTES_HELP =
      "Total file size in bytes for scanned data files";

  public static final String SCAN_TOTAL_DELETE_FILE_SIZE_BYTES_NAME =
      "iceberg_scan_total_delete_file_size_bytes";
  public static final String SCAN_TOTAL_DELETE_FILE_SIZE_BYTES_HELP =
      "Total file size in bytes for scanned delete files";

  public static final String SCAN_PLANNING_DURATION_NAME = "iceberg_scan_planning_duration_seconds";
  public static final String SCAN_PLANNING_DURATION_HELP = "Duration of scan planning in seconds";

  public static final String SCAN_DATA_FILES_PER_SCAN_NAME = "iceberg_scan_data_files_per_scan";
  public static final String SCAN_DATA_FILES_PER_SCAN_HELP = "Distribution of data files per scan";

  // ---------------------------------------------------------------------------------------------
  // Commit metrics
  // ---------------------------------------------------------------------------------------------

  public static final String COMMITS_TOTAL_NAME = "iceberg_commits_total";
  public static final String COMMITS_TOTAL_HELP = "Total number of Iceberg table commits";

  public static final String COMMIT_ADDED_DATA_FILES_NAME = "iceberg_commit_added_data_files_total";
  public static final String COMMIT_ADDED_DATA_FILES_HELP =
      "Total number of data files added in commits";

  public static final String COMMIT_REMOVED_DATA_FILES_NAME =
      "iceberg_commit_removed_data_files_total";
  public static final String COMMIT_REMOVED_DATA_FILES_HELP =
      "Total number of data files removed in commits";

  public static final String COMMIT_ADDED_DELETE_FILES_NAME =
      "iceberg_commit_added_delete_files_total";
  public static final String COMMIT_ADDED_DELETE_FILES_HELP =
      "Total number of delete files added in commits";

  public static final String COMMIT_REMOVED_DELETE_FILES_NAME =
      "iceberg_commit_removed_delete_files_total";
  public static final String COMMIT_REMOVED_DELETE_FILES_HELP =
      "Total number of delete files removed in commits";

  public static final String COMMIT_ADDED_RECORDS_NAME = "iceberg_commit_added_records_total";
  public static final String COMMIT_ADDED_RECORDS_HELP = "Total number of records added in commits";

  public static final String COMMIT_REMOVED_RECORDS_NAME = "iceberg_commit_removed_records_total";
  public static final String COMMIT_REMOVED_RECORDS_HELP =
      "Total number of records removed in commits";

  public static final String COMMIT_ADDED_EQUALITY_DELETES_NAME =
      "iceberg_commit_added_equality_deletes_total";
  public static final String COMMIT_ADDED_EQUALITY_DELETES_HELP =
      "Total number of equality deletes added in commits";

  public static final String COMMIT_TOTAL_FILES_SIZE_BYTES_NAME =
      "iceberg_commit_total_files_size_bytes";
  public static final String COMMIT_TOTAL_FILES_SIZE_BYTES_HELP =
      "Total size in bytes of files involved in commits";

  public static final String COMMIT_DURATION_NAME = "iceberg_commit_duration_seconds";
  public static final String COMMIT_DURATION_HELP = "Duration of commit operations in seconds";

  // ---------------------------------------------------------------------------------------------
  // HTTP/REST endpoint metrics
  // ---------------------------------------------------------------------------------------------

  public static final String HTTP_REQUESTS_TOTAL_NAME = "iceberg_http_requests_total";
  public static final String HTTP_REQUESTS_TOTAL_HELP = "Total number of HTTP requests";

  public static final String HTTP_REQUEST_DURATION_NAME = "iceberg_http_request_duration_seconds";
  public static final String HTTP_REQUEST_DURATION_HELP = "HTTP request duration in seconds";

  public static final String HTTP_RESPONSES_TOTAL_NAME = "iceberg_http_responses_total";
  public static final String HTTP_RESPONSES_TOTAL_HELP =
      "Total number of HTTP responses by status class";

  public static final String HTTP_REQUESTS_IN_FLIGHT_NAME = "iceberg_http_requests_in_flight";
  public static final String HTTP_REQUESTS_IN_FLIGHT_HELP =
      "Number of HTTP requests currently being processed";

  public static final String HTTP_RESPONSE_SIZE_BYTES_NAME = "iceberg_http_response_size_bytes";
  public static final String HTTP_RESPONSE_SIZE_BYTES_HELP = "HTTP response size in bytes";

  // ---------------------------------------------------------------------------------------------
  // Catalog-level metrics
  // ---------------------------------------------------------------------------------------------

  public static final String CATALOG_TABLES_NAME = "iceberg_catalog_tables";
  public static final String CATALOG_TABLES_HELP = "Current number of tables in the catalog";

  public static final String CATALOG_NAMESPACES_NAME = "iceberg_catalog_namespaces";
  public static final String CATALOG_NAMESPACES_HELP =
      "Current number of namespaces in the catalog";

  public static final String CATALOG_OPERATIONS_TOTAL_NAME = "iceberg_catalog_operations_total";
  public static final String CATALOG_OPERATIONS_TOTAL_HELP =
      "Total number of catalog operations (create/drop table/namespace)";

  // ---------------------------------------------------------------------------------------------
  // Table-level metrics (from CommitReport)
  // ---------------------------------------------------------------------------------------------

  public static final String TABLE_SNAPSHOTS_TOTAL_NAME = "iceberg_table_snapshots_total";
  public static final String TABLE_SNAPSHOTS_TOTAL_HELP = "Total number of snapshots per table";

  public static final String TABLE_SCHEMA_UPDATES_TOTAL_NAME = "iceberg_table_schema_updates_total";
  public static final String TABLE_SCHEMA_UPDATES_TOTAL_HELP =
      "Total number of schema evolutions per table";
}
b/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/metrics/MaintenanceMetrics.java @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2025 Altinity Inc and/or its affiliates. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ +package com.altinity.ice.rest.catalog.internal.metrics; + +import io.prometheus.metrics.core.metrics.Counter; +import io.prometheus.metrics.core.metrics.Gauge; +import io.prometheus.metrics.core.metrics.Histogram; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class MaintenanceMetrics { + + private static final Logger logger = LoggerFactory.getLogger(MaintenanceMetrics.class); + + private static class Holder { + private static final MaintenanceMetrics INSTANCE = new MaintenanceMetrics(); + } + + // ========================================================================== + // Labels + // ========================================================================== + + private static final String LABEL_STATUS = "status"; + private static final String LABEL_TABLE = "table"; + + private static final String[] STATUS_LABELS = {LABEL_STATUS}; + + // ========================================================================== + // General Maintenance Metrics + // ========================================================================== + + private static final String RUNS_TOTAL_NAME = "ice_maintenance_runs_total"; + private static final String RUNS_TOTAL_HELP = "Total number of maintenance runs"; + + private static final String DURATION_SECONDS_NAME = "ice_maintenance_duration_seconds"; + private static final String DURATION_SECONDS_HELP = "Duration of maintenance run in seconds"; + + private static final String IN_PROGRESS_NAME = "ice_maintenance_in_progress"; + private static final String IN_PROGRESS_HELP = + 
"Whether maintenance is currently running (1 = running, 0 = idle)"; + + private static final String LAST_RUN_TIMESTAMP_NAME = "ice_maintenance_last_run_timestamp"; + private static final String LAST_RUN_TIMESTAMP_HELP = + "Unix timestamp of the last maintenance run"; + + private static final String START_TIMESTAMP_NAME = "ice_maintenance_start_timestamp"; + private static final String START_TIMESTAMP_HELP = + "Unix timestamp when current maintenance started (0 if not running)"; + + private static final String SKIPPED_TOTAL_NAME = "ice_maintenance_skipped_total"; + private static final String SKIPPED_TOTAL_HELP = + "Times maintenance was skipped (already in maintenance mode)"; + + private static final String ORPHAN_FILES_FOUND_TOTAL_NAME = + "ice_maintenance_orphan_files_found_total"; + private static final String ORPHAN_FILES_FOUND_TOTAL_HELP = "Total orphaned files discovered"; + + private static final String ORPHAN_FILES_DELETED_TOTAL_NAME = + "ice_maintenance_orphan_files_deleted_total"; + private static final String ORPHAN_FILES_DELETED_TOTAL_HELP = + "Total orphaned files successfully deleted"; + + private static final String ORPHAN_FILES_EXCLUDED_TOTAL_NAME = + "ice_maintenance_orphan_files_excluded_total"; + private static final String ORPHAN_FILES_EXCLUDED_TOTAL_HELP = "Files excluded by whitelist"; + + private static final String ORPHAN_DELETE_FAILURES_TOTAL_NAME = + "ice_maintenance_orphan_delete_failures_total"; + private static final String ORPHAN_DELETE_FAILURES_TOTAL_HELP = "Files that failed to delete"; + + private static final String COMPACTION_FILES_MERGED_TOTAL_NAME = + "ice_maintenance_compaction_files_merged_total"; + private static final String COMPACTION_FILES_MERGED_TOTAL_HELP = "Total input files merged"; + + private static final String COMPACTION_OUTPUT_FILES_TOTAL_NAME = + "ice_maintenance_compaction_output_files_total"; + private static final String COMPACTION_OUTPUT_FILES_TOTAL_HELP = + "Total output files produced after merge"; + + 
private static final String COMPACTION_BYTES_READ_TOTAL_NAME = + "ice_maintenance_compaction_bytes_read_total"; + private static final String COMPACTION_BYTES_READ_TOTAL_HELP = + "Total bytes read during compaction"; + + private static final String COMPACTION_BYTES_WRITTEN_TOTAL_NAME = + "ice_maintenance_compaction_bytes_written_total"; + private static final String COMPACTION_BYTES_WRITTEN_TOTAL_HELP = + "Total bytes written during compaction"; + + private static final double[] DURATION_BUCKETS = { + 0.1, 0.5, 1, 5, 10, 30, 60, 120, 300, 600, 1800, 3600 + }; + + // General + private final Counter runsTotal; + private final Histogram durationSeconds; + private final Gauge inProgress; + private final Gauge lastRunTimestamp; + private final Gauge startTimestamp; + private final Counter skippedTotal; + + // Orphan Cleanup + private final Counter orphanFilesFoundTotal; + private final Counter orphanFilesDeletedTotal; + private final Counter orphanFilesExcludedTotal; + private final Counter orphanDeleteFailuresTotal; + + // Data Compaction + private final Counter compactionFilesMergedTotal; + private final Counter compactionOutputFilesTotal; + private final Counter compactionBytesReadTotal; + private final Counter compactionBytesWrittenTotal; + + /** Returns the singleton instance of the metrics. 
*/ + public static MaintenanceMetrics getInstance() { + return Holder.INSTANCE; + } + + private MaintenanceMetrics() { + // General maintenance metrics + this.runsTotal = + Counter.builder() + .name(RUNS_TOTAL_NAME) + .help(RUNS_TOTAL_HELP) + .labelNames(STATUS_LABELS) + .register(); + + this.durationSeconds = + Histogram.builder() + .name(DURATION_SECONDS_NAME) + .help(DURATION_SECONDS_HELP) + .classicUpperBounds(DURATION_BUCKETS) + .register(); + + this.inProgress = Gauge.builder().name(IN_PROGRESS_NAME).help(IN_PROGRESS_HELP).register(); + + this.lastRunTimestamp = + Gauge.builder() + .name(LAST_RUN_TIMESTAMP_NAME) + .help(LAST_RUN_TIMESTAMP_HELP) + .labelNames(STATUS_LABELS) + .register(); + + this.startTimestamp = + Gauge.builder().name(START_TIMESTAMP_NAME).help(START_TIMESTAMP_HELP).register(); + + this.skippedTotal = + Counter.builder().name(SKIPPED_TOTAL_NAME).help(SKIPPED_TOTAL_HELP).register(); + + // Orphan cleanup metrics + this.orphanFilesFoundTotal = + Counter.builder() + .name(ORPHAN_FILES_FOUND_TOTAL_NAME) + .help(ORPHAN_FILES_FOUND_TOTAL_HELP) + .labelNames(LABEL_TABLE) + .register(); + + this.orphanFilesDeletedTotal = + Counter.builder() + .name(ORPHAN_FILES_DELETED_TOTAL_NAME) + .help(ORPHAN_FILES_DELETED_TOTAL_HELP) + .labelNames(LABEL_TABLE) + .register(); + + this.orphanFilesExcludedTotal = + Counter.builder() + .name(ORPHAN_FILES_EXCLUDED_TOTAL_NAME) + .help(ORPHAN_FILES_EXCLUDED_TOTAL_HELP) + .labelNames(LABEL_TABLE) + .register(); + + this.orphanDeleteFailuresTotal = + Counter.builder() + .name(ORPHAN_DELETE_FAILURES_TOTAL_NAME) + .help(ORPHAN_DELETE_FAILURES_TOTAL_HELP) + .labelNames(LABEL_TABLE) + .register(); + + // Data compaction metrics + this.compactionFilesMergedTotal = + Counter.builder() + .name(COMPACTION_FILES_MERGED_TOTAL_NAME) + .help(COMPACTION_FILES_MERGED_TOTAL_HELP) + .labelNames(LABEL_TABLE) + .register(); + + this.compactionOutputFilesTotal = + Counter.builder() + .name(COMPACTION_OUTPUT_FILES_TOTAL_NAME) + 
.help(COMPACTION_OUTPUT_FILES_TOTAL_HELP) + .labelNames(LABEL_TABLE) + .register(); + + this.compactionBytesReadTotal = + Counter.builder() + .name(COMPACTION_BYTES_READ_TOTAL_NAME) + .help(COMPACTION_BYTES_READ_TOTAL_HELP) + .labelNames(LABEL_TABLE) + .register(); + + this.compactionBytesWrittenTotal = + Counter.builder() + .name(COMPACTION_BYTES_WRITTEN_TOTAL_NAME) + .help(COMPACTION_BYTES_WRITTEN_TOTAL_HELP) + .labelNames(LABEL_TABLE) + .register(); + + logger.info("Maintenance Prometheus metrics initialized"); + } + + public void recordMaintenanceStarted() { + inProgress.set(1.0); + startTimestamp.set(System.currentTimeMillis() / 1000.0); + } + + public void recordMaintenanceCompleted(boolean success, double durationSecs) { + String status = success ? "success" : "failure"; + runsTotal.labelValues(status).inc(); + durationSeconds.observe(durationSecs); + lastRunTimestamp.labelValues(status).set(System.currentTimeMillis() / 1000.0); + inProgress.set(0.0); + startTimestamp.set(0.0); + } + + public void recordMaintenanceSkipped() { + skippedTotal.inc(); + } + + public void recordOrphanFilesFound(String table, int count) { + orphanFilesFoundTotal.labelValues(table).inc(count); + } + + public void recordOrphanFilesDeleted(String table, int count) { + orphanFilesDeletedTotal.labelValues(table).inc(count); + } + + public void recordOrphanFilesExcluded(String table, int count) { + orphanFilesExcludedTotal.labelValues(table).inc(count); + } + + public void recordOrphanDeleteFailure(String table, int count) { + orphanDeleteFailuresTotal.labelValues(table).inc(count); + } + + public void recordCompactionFilesMerged(String table, int count) { + compactionFilesMergedTotal.labelValues(table).inc(count); + } + + public void recordCompactionOutputFile(String table) { + compactionOutputFilesTotal.labelValues(table).inc(); + } + + public void recordCompactionBytesRead(String table, long bytes) { + compactionBytesReadTotal.labelValues(table).inc(bytes); + } + + public void 
recordCompactionBytesWritten(String table, long bytes) { + compactionBytesWrittenTotal.labelValues(table).inc(bytes); + } +} diff --git a/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/metrics/PrometheusMetricsReporter.java b/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/metrics/PrometheusMetricsReporter.java new file mode 100644 index 0000000..3dd7964 --- /dev/null +++ b/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/metrics/PrometheusMetricsReporter.java @@ -0,0 +1,572 @@ +/* + * Copyright (c) 2025 Altinity Inc and/or its affiliates. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ +package com.altinity.ice.rest.catalog.internal.metrics; + +import static com.altinity.ice.rest.catalog.internal.metrics.IcebergMetricNames.*; + +import io.prometheus.metrics.core.datapoints.DistributionDataPoint; +import io.prometheus.metrics.core.metrics.Counter; +import io.prometheus.metrics.core.metrics.Histogram; +import java.util.Arrays; +import java.util.concurrent.TimeUnit; +import org.apache.iceberg.metrics.CommitReport; +import org.apache.iceberg.metrics.MetricsReport; +import org.apache.iceberg.metrics.MetricsReporter; +import org.apache.iceberg.metrics.ScanReport; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A Prometheus-based metrics reporter for Iceberg operations. This reporter exposes Iceberg scan + * and commit metrics via Prometheus, making them available at the /metrics endpoint. + * + *

This class implements {@link MetricsReporter} to follow Iceberg's standard metrics reporting + * pattern, allowing it to be used both as a server-side reporter (receiving metrics via REST) and + * potentially as a client-side reporter via catalog configuration. + * + *

This class uses a singleton pattern because Prometheus metrics can only be registered once per + * JVM. + * + *

All duration metrics use Histograms instead of Summaries to allow aggregation across multiple
+ * instances in distributed deployments.
+ *
+ * @see IcebergMetricNames for metric names and help strings
+ * @see Iceberg Metrics
+ *     Reporting
+ */
+public class PrometheusMetricsReporter implements MetricsReporter {
+
+  private static final Logger logger = LoggerFactory.getLogger(PrometheusMetricsReporter.class);
+
+  /** Holds the single reporter instance returned by {@link #getInstance()}. */
+  private static class Holder {
+    private static final PrometheusMetricsReporter INSTANCE = new PrometheusMetricsReporter();
+  }
+
+  // Scan metrics counters
+  private final Counter scansTotal;
+  private final Counter scanResultDataFiles;
+  private final Counter scanResultDeleteFiles;
+  private final Counter scanIndexedDeleteFiles;
+  private final Counter scanPositionalDeleteFiles;
+  private final Counter scanEqualityDeleteFiles;
+  private final Counter scanTotalDataManifests;
+  private final Counter scanTotalDeleteManifests;
+  private final Counter scanScannedDataManifests;
+  private final Counter scanSkippedDataManifests;
+  private final Counter scanTotalFileSizeBytes;
+  private final Counter scanTotalDeleteFileSizeBytes;
+
+  // Scan distributions (Histogram for aggregation across instances): planning time in seconds
+  // and number of data files returned per scan
+  private final Histogram scanPlanningDuration;
+  private final Histogram scanDataFilesPerScan;
+
+  // Commit metrics counters
+  private final Counter commitsTotal;
+  private final Counter commitAddedDataFiles;
+  private final Counter commitRemovedDataFiles;
+  private final Counter commitAddedDeleteFiles;
+  private final Counter commitRemovedDeleteFiles;
+  private final Counter commitAddedRecords;
+  private final Counter commitRemovedRecords;
+  private final Counter commitAddedEqualityDeletes;
+  private final Counter commitTotalFilesSizeBytes;
+
+  // Commit timing metrics (Histogram for aggregation across instances)
+  private final Histogram commitDuration;
+
+  // Error counter
+  private final Counter metricsReportErrors;
+
+  // Status metric to confirm reporter is active
+  private final Counter metricsReporterActive;
+
+  // Table-level metrics
+  private final Counter tableSnapshotsTotal;
+  private final Counter tableSchemaUpdatesTotal;
+
+  /** Returns the singleton instance of the metrics reporter. */
+  public static PrometheusMetricsReporter getInstance() {
+    return Holder.INSTANCE;
+  }
+
+  /** Builds and registers a counter with the given name, help text and label names. */
+  private static Counter newCounter(String name, String help, String... labelNames) {
+    return Counter.builder().name(name).help(help).labelNames(labelNames).register();
+  }
+
+  /** Builds and registers a histogram that uses the shared {@code DURATION_BUCKETS} bounds. */
+  private static Histogram newDurationHistogram(String name, String help, String... labelNames) {
+    return Histogram.builder()
+        .name(name)
+        .help(help)
+        .labelNames(labelNames)
+        .classicUpperBounds(DURATION_BUCKETS)
+        .register();
+  }
+
+  /**
+   * Registers every metric exposed by this reporter. Private because instances are only created
+   * through {@link Holder}.
+   */
+  private PrometheusMetricsReporter() {
+    // Status metric - incremented once to confirm reporter is active and visible in /metrics
+    this.metricsReporterActive =
+        Counter.builder().name(REPORTER_ACTIVE_NAME).help(REPORTER_ACTIVE_HELP).register();
+    this.metricsReporterActive.inc(); // Make it visible immediately
+
+    // Error counter
+    this.metricsReportErrors = newCounter(REPORT_ERRORS_NAME, REPORT_ERRORS_HELP, LABEL_TYPE);
+
+    // Scan metrics
+    this.scansTotal = newCounter(SCANS_TOTAL_NAME, SCANS_TOTAL_HELP, SCAN_LABELS);
+    this.scanResultDataFiles =
+        newCounter(SCAN_RESULT_DATA_FILES_NAME, SCAN_RESULT_DATA_FILES_HELP, SCAN_LABELS);
+    this.scanResultDeleteFiles =
+        newCounter(SCAN_RESULT_DELETE_FILES_NAME, SCAN_RESULT_DELETE_FILES_HELP, SCAN_LABELS);
+    this.scanIndexedDeleteFiles =
+        newCounter(SCAN_INDEXED_DELETE_FILES_NAME, SCAN_INDEXED_DELETE_FILES_HELP, SCAN_LABELS);
+    this.scanPositionalDeleteFiles =
+        newCounter(
+            SCAN_POSITIONAL_DELETE_FILES_NAME, SCAN_POSITIONAL_DELETE_FILES_HELP, SCAN_LABELS);
+    this.scanEqualityDeleteFiles =
+        newCounter(SCAN_EQUALITY_DELETE_FILES_NAME, SCAN_EQUALITY_DELETE_FILES_HELP, SCAN_LABELS);
+    this.scanTotalDataManifests =
+        newCounter(SCAN_TOTAL_DATA_MANIFESTS_NAME, SCAN_TOTAL_DATA_MANIFESTS_HELP, SCAN_LABELS);
+    this.scanTotalDeleteManifests =
+        newCounter(
+            SCAN_TOTAL_DELETE_MANIFESTS_NAME, SCAN_TOTAL_DELETE_MANIFESTS_HELP, SCAN_LABELS);
+    this.scanScannedDataManifests =
+        newCounter(
+            SCAN_SCANNED_DATA_MANIFESTS_NAME, SCAN_SCANNED_DATA_MANIFESTS_HELP, SCAN_LABELS);
+    this.scanSkippedDataManifests =
+        newCounter(
+            SCAN_SKIPPED_DATA_MANIFESTS_NAME, SCAN_SKIPPED_DATA_MANIFESTS_HELP, SCAN_LABELS);
+    this.scanTotalFileSizeBytes =
+        newCounter(SCAN_TOTAL_FILE_SIZE_BYTES_NAME, SCAN_TOTAL_FILE_SIZE_BYTES_HELP, SCAN_LABELS);
+    this.scanTotalDeleteFileSizeBytes =
+        newCounter(
+            SCAN_TOTAL_DELETE_FILE_SIZE_BYTES_NAME,
+            SCAN_TOTAL_DELETE_FILE_SIZE_BYTES_HELP,
+            SCAN_LABELS);
+
+    // Scan timing - using Histogram for aggregation across instances
+    this.scanPlanningDuration =
+        newDurationHistogram(SCAN_PLANNING_DURATION_NAME, SCAN_PLANNING_DURATION_HELP, SCAN_LABELS);
+
+    // Data files per scan is a count, not a duration, so it keeps its own exponential buckets
+    this.scanDataFilesPerScan =
+        Histogram.builder()
+            .name(SCAN_DATA_FILES_PER_SCAN_NAME)
+            .help(SCAN_DATA_FILES_PER_SCAN_HELP)
+            .labelNames(SCAN_LABELS)
+            .classicExponentialUpperBounds(1, 2, 10)
+            .register();
+
+    // Commit metrics
+    this.commitsTotal = newCounter(COMMITS_TOTAL_NAME, COMMITS_TOTAL_HELP, COMMIT_LABELS);
+    this.commitAddedDataFiles =
+        newCounter(COMMIT_ADDED_DATA_FILES_NAME, COMMIT_ADDED_DATA_FILES_HELP, COMMIT_LABELS);
+    this.commitRemovedDataFiles =
+        newCounter(COMMIT_REMOVED_DATA_FILES_NAME, COMMIT_REMOVED_DATA_FILES_HELP, COMMIT_LABELS);
+    this.commitAddedDeleteFiles =
+        newCounter(COMMIT_ADDED_DELETE_FILES_NAME, COMMIT_ADDED_DELETE_FILES_HELP, COMMIT_LABELS);
+    this.commitRemovedDeleteFiles =
+        newCounter(
+            COMMIT_REMOVED_DELETE_FILES_NAME, COMMIT_REMOVED_DELETE_FILES_HELP, COMMIT_LABELS);
+    this.commitAddedRecords =
+        newCounter(COMMIT_ADDED_RECORDS_NAME, COMMIT_ADDED_RECORDS_HELP, COMMIT_LABELS);
+    this.commitRemovedRecords =
+        newCounter(COMMIT_REMOVED_RECORDS_NAME, COMMIT_REMOVED_RECORDS_HELP, COMMIT_LABELS);
+    this.commitAddedEqualityDeletes =
+        newCounter(
+            COMMIT_ADDED_EQUALITY_DELETES_NAME, COMMIT_ADDED_EQUALITY_DELETES_HELP, COMMIT_LABELS);
+    this.commitTotalFilesSizeBytes =
+        newCounter(
+            COMMIT_TOTAL_FILES_SIZE_BYTES_NAME, COMMIT_TOTAL_FILES_SIZE_BYTES_HELP, COMMIT_LABELS);
+
+    // Commit timing - using Histogram for aggregation across instances
+    this.commitDuration =
+        newDurationHistogram(COMMIT_DURATION_NAME, COMMIT_DURATION_HELP, COMMIT_LABELS);
+
+    // Table-level metrics for snapshots and schema evolution
+    this.tableSnapshotsTotal =
+        newCounter(TABLE_SNAPSHOTS_TOTAL_NAME, TABLE_SNAPSHOTS_TOTAL_HELP, SCAN_LABELS);
+    this.tableSchemaUpdatesTotal =
+        newCounter(TABLE_SCHEMA_UPDATES_TOTAL_NAME, TABLE_SCHEMA_UPDATES_TOTAL_HELP, SCAN_LABELS);
+
+    logger.info("Prometheus Iceberg metrics reporter initialized");
+  }
+
+  /**
+   * Report metrics from an Iceberg MetricsReport.
+   *
+   * @param catalogName the catalog name for multi-catalog deployments; {@code null} is recorded
+   *     under the label value {@code "default"}
+   * @param report the metrics report to process; {@code null} is ignored
+   */
+  public void report(String catalogName, MetricsReport report) {
+    logger.debug("Reporting metrics report: catalogName: {}, report: {}", catalogName, report);
+    if (report == null) {
+      return;
+    }
+
+    String catalog = catalogName != null ? catalogName : "default";
+
+    try {
+      if (report instanceof ScanReport) {
+        reportScanMetrics(catalog, (ScanReport) report);
+      } else if (report instanceof CommitReport) {
+        reportCommitMetrics(catalog, (CommitReport) report);
+      } else {
+        logger.debug("Unknown metrics report type: {}", report.getClass().getName());
+      }
+    } catch (Exception e) {
+      // Failures are caught here rather than propagated: they are logged and counted per
+      // report type.
+      String reportType = report.getClass().getSimpleName();
+      // The trailing throwable is not bound to a placeholder, so SLF4J logs its stack trace
+      // in addition to the formatted message.
+      logger.warn("Error processing {} metrics report: {}", reportType, e.getMessage(), e);
+      metricsReportErrors.labelValues(reportType).inc();
+    }
+  }
+
+  /**
+   * Report metrics from an Iceberg MetricsReport using default catalog name.
+   *
+   *

This method implements {@link MetricsReporter#report(MetricsReport)}.
+   *
+   * @param report the metrics report to process
+   */
+  @Override
+  public void report(MetricsReport report) {
+    logger.debug("Reporting metrics report: {}", report);
+    // Delegate with a null catalog name; the two-argument overload substitutes "default".
+    report(null, report);
+  }
+
+  /**
+   * Records the metrics carried by a scan report.
+   *
+   * <p>Always counts one scan for the table. When {@code report.scanMetrics()} is non-null, each
+   * counter result that is itself non-null is added to the matching Prometheus counter, and the
+   * planning duration is observed. Nothing is caught here, so any exception propagates to the
+   * caller.
+   *
+   * @param catalog catalog label value
+   * @param report the scan report to record
+   */
+  private void reportScanMetrics(String catalog, ScanReport report) {
+    String fullTableName = report.tableName();
+    String namespace = extractNamespace(fullTableName);
+    String table = extractTableName(fullTableName);
+
+    logger.debug(
+        "Recording scan metrics for catalog: {}, namespace: {}, table: {}",
+        catalog,
+        namespace,
+        table);
+
+    scansTotal.labelValues(catalog, namespace, table).inc();
+
+    var metrics = report.scanMetrics();
+    if (metrics == null) {
+      // Only the scan count can be recorded without detailed metrics.
+      return;
+    }
+
+    // File counts
+    if (metrics.resultDataFiles() != null) {
+      long count = metrics.resultDataFiles().value();
+      // The same value feeds both the running total and the per-scan distribution.
+      scanResultDataFiles.labelValues(catalog, namespace, table).inc(count);
+      scanDataFilesPerScan.labelValues(catalog, namespace, table).observe(count);
+    }
+
+    if (metrics.resultDeleteFiles() != null) {
+      scanResultDeleteFiles
+          .labelValues(catalog, namespace, table)
+          .inc(metrics.resultDeleteFiles().value());
+    }
+
+    // Additional delete file metrics (if available in the Iceberg version)
+    if (metrics.indexedDeleteFiles() != null) {
+      scanIndexedDeleteFiles
+          .labelValues(catalog, namespace, table)
+          .inc(metrics.indexedDeleteFiles().value());
+    }
+
+    if (metrics.positionalDeleteFiles() != null) {
+      scanPositionalDeleteFiles
+          .labelValues(catalog, namespace, table)
+          .inc(metrics.positionalDeleteFiles().value());
+    }
+
+    if (metrics.equalityDeleteFiles() != null) {
+      scanEqualityDeleteFiles
+          .labelValues(catalog, namespace, table)
+          .inc(metrics.equalityDeleteFiles().value());
+    }
+
+    // Manifest counts
+    if (metrics.totalDataManifests() != null) {
+      scanTotalDataManifests
+          .labelValues(catalog, namespace, table)
+          .inc(metrics.totalDataManifests().value());
+    }
+
+    if (metrics.totalDeleteManifests() != null) {
+      scanTotalDeleteManifests
+          .labelValues(catalog, namespace, table)
+          .inc(metrics.totalDeleteManifests().value());
+    }
+
+    if (metrics.scannedDataManifests() != null) {
+      scanScannedDataManifests
+          .labelValues(catalog, namespace, table)
+          .inc(metrics.scannedDataManifests().value());
+    }
+
+    if (metrics.skippedDataManifests() != null) {
+      scanSkippedDataManifests
+          .labelValues(catalog, namespace, table)
+          .inc(metrics.skippedDataManifests().value());
+    }
+
+    // File sizes
+    if (metrics.totalFileSizeInBytes() != null) {
+      scanTotalFileSizeBytes
+          .labelValues(catalog, namespace, table)
+          .inc(metrics.totalFileSizeInBytes().value());
+    }
+
+    if (metrics.totalDeleteFileSizeInBytes() != null) {
+      scanTotalDeleteFileSizeBytes
+          .labelValues(catalog, namespace, table)
+          .inc(metrics.totalDeleteFileSizeInBytes().value());
+    }
+
+    // Timing metrics
+    if (metrics.totalPlanningDuration() != null) {
+      observeDuration(
+          scanPlanningDuration.labelValues(catalog, namespace, table),
+          metrics.totalPlanningDuration().totalDuration());
+    }
+  }
+
+  /**
+   * Records the metrics carried by a commit report.
+   *
+   * <p>Always counts one commit, labelled with the report's operation ({@code "unknown"} when the
+   * operation is null). When {@code report.commitMetrics()} is non-null, each non-null counter
+   * result is added to the matching Prometheus counter, the commit duration is observed, and
+   * the table snapshot counter is incremented. Nothing is caught here, so any exception
+   * propagates to the caller.
+   *
+   * @param catalog catalog label value
+   * @param report the commit report to record
+   */
+  private void reportCommitMetrics(String catalog, CommitReport report) {
+    String fullTableName = report.tableName();
+    String namespace = extractNamespace(fullTableName);
+    String table = extractTableName(fullTableName);
+    String operation = report.operation() != null ? report.operation() : "unknown";
+
+    logger.debug(
+        "Recording commit metrics for catalog: {}, namespace: {}, table: {}, operation: {}",
+        catalog,
+        namespace,
+        table,
+        operation);
+
+    commitsTotal.labelValues(catalog, namespace, table, operation).inc();
+
+    var metrics = report.commitMetrics();
+    if (metrics == null) {
+      // NOTE(review): this early return also skips the tableSnapshotsTotal increment at the end
+      // of the method, so such commits are counted in commitsTotal only - confirm intended.
+      return;
+    }
+
+    if (metrics.addedDataFiles() != null) {
+      commitAddedDataFiles
+          .labelValues(catalog, namespace, table, operation)
+          .inc(metrics.addedDataFiles().value());
+    }
+
+    if (metrics.removedDataFiles() != null) {
+      commitRemovedDataFiles
+          .labelValues(catalog, namespace, table, operation)
+          .inc(metrics.removedDataFiles().value());
+    }
+
+    if (metrics.addedDeleteFiles() != null) {
+      commitAddedDeleteFiles
+          .labelValues(catalog, namespace, table, operation)
+          .inc(metrics.addedDeleteFiles().value());
+    }
+
+    if (metrics.removedDeleteFiles() != null) {
+      commitRemovedDeleteFiles
+          .labelValues(catalog, namespace, table, operation)
+          .inc(metrics.removedDeleteFiles().value());
+    }
+
+    if (metrics.addedRecords() != null) {
+      commitAddedRecords
+          .labelValues(catalog, namespace, table, operation)
+          .inc(metrics.addedRecords().value());
+    }
+
+    if (metrics.removedRecords() != null) {
+      commitRemovedRecords
+          .labelValues(catalog, namespace, table, operation)
+          .inc(metrics.removedRecords().value());
+    }
+
+    if (metrics.addedEqualityDeletes() != null) {
+      commitAddedEqualityDeletes
+          .labelValues(catalog, namespace, table, operation)
+          .inc(metrics.addedEqualityDeletes().value());
+    }
+
+    if (metrics.totalFilesSizeInBytes() != null) {
+      commitTotalFilesSizeBytes
+          .labelValues(catalog, namespace, table, operation)
+          .inc(metrics.totalFilesSizeInBytes().value());
+    }
+
+    if (metrics.totalDuration() != null) {
+      observeDuration(
+          commitDuration.labelValues(catalog, namespace, table, operation),
+          metrics.totalDuration().totalDuration());
+    }
+
+    // The snapshot counter carries no operation label, unlike the commit counters above.
+    tableSnapshotsTotal.labelValues(catalog, namespace, table).inc();
+  }
+
+  /**
+   * Record a
schema update for a table. Called from RESTCatalogAdapter when UpdateTableRequest
+   * contains schema-related MetadataUpdate objects (AddSchema, SetCurrentSchema, etc.).
+   */
+  public void recordSchemaUpdate(String catalog, String namespace, String table) {
+    tableSchemaUpdatesTotal.labelValues(catalog, namespace, table).inc();
+  }
+
+  /**
+   * Observes {@code duration} on {@code dataPoint}, converted to fractional seconds. A null
+   * duration is ignored.
+   */
+  private void observeDuration(DistributionDataPoint dataPoint, java.time.Duration duration) {
+    if (duration == null) {
+      return;
+    }
+    // Divide in floating point so sub-second precision is kept.
+    double nanosPerSecond = TimeUnit.SECONDS.toNanos(1);
+    dataPoint.observe(duration.toNanos() / nanosPerSecond);
+  }
+
+  /**
+   * Extracts the namespace from a full table name. Handles various formats:
+   *
+   *

+   * <ul>
+   *   <li>"namespace.table" -> "namespace"
+   *   <li>"db.schema.table" -> "db.schema"
+   *   <li>"table" -> "default"
+   * </ul>
+   */
+  private String extractNamespace(String fullTableName) {
+    // A missing or empty name cannot be split; it is reported under a fixed label.
+    if (fullTableName == null || fullTableName.isEmpty()) {
+      return "unknown";
+    }
+    String[] segments = fullTableName.split("\\.");
+    if (segments.length <= 1) {
+      // No namespace component in the name.
+      return "default";
+    }
+    // Every segment except the last one belongs to the namespace.
+    return String.join(".", Arrays.copyOfRange(segments, 0, segments.length - 1));
+  }
+
+  /**
+   * Extracts the table name from a full table name. Handles various formats:
+   *
+   *
+   * <ul>
+   *   <li>"namespace.table" -> "table"
+   *   <li>"db.schema.table" -> "table"
+   *   <li>"table" -> "table"
+   * </ul>
+   */
+  private String extractTableName(String fullTableName) {
+    if (fullTableName == null || fullTableName.isEmpty()) {
+      return "unknown";
+    }
+    String[] parts = fullTableName.split("\\.");
+    // String.split drops trailing empty segments, so a name made only of dots (e.g. ".")
+    // yields an empty array; indexing it would throw ArrayIndexOutOfBoundsException.
+    if (parts.length == 0) {
+      return "unknown";
+    }
+    return parts[parts.length - 1];
+  }
+}
diff --git a/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/rest/RESTCatalogAdapter.java b/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/rest/RESTCatalogAdapter.java index fa3b204..bd69121 100644 --- a/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/rest/RESTCatalogAdapter.java +++ b/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/rest/RESTCatalogAdapter.java @@ -24,6 +24,8 @@ import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; import com.altinity.ice.rest.catalog.internal.auth.Session; +import com.altinity.ice.rest.catalog.internal.metrics.CatalogMetrics; +import com.altinity.ice.rest.catalog.internal.metrics.PrometheusMetricsReporter; import java.util.Arrays; import java.util.List; import java.util.Map; @@ -31,6 +33,7 @@ import java.util.stream.Collectors; import org.apache.iceberg.BaseTable; import org.apache.iceberg.BaseTransaction; +import org.apache.iceberg.MetadataUpdate; import org.apache.iceberg.Table; import org.apache.iceberg.TableMetadata; import org.apache.iceberg.TableOperations; @@ -125,8 +128,9 @@ public T handle( case CREATE_NAMESPACE: if (asNamespaceCatalog != null) { CreateNamespaceRequest request = castRequest(CreateNamespaceRequest.class, requestBody); - return castResponse( - responseType, CatalogHandlers.createNamespace(asNamespaceCatalog, request)); + var response = CatalogHandlers.createNamespace(asNamespaceCatalog, request); + CatalogMetrics.getInstance().recordNamespaceCreated(catalog.name()); + return castResponse(responseType, response); } break; @@ -148,6 +152,7 @@ public T handle( case DROP_NAMESPACE: if (asNamespaceCatalog != null) { CatalogHandlers.dropNamespace(asNamespaceCatalog, 
namespaceFromPathVars(vars)); + CatalogMetrics.getInstance().recordNamespaceDropped(catalog.name()); return null; } break; @@ -185,8 +190,9 @@ public T handle( return castResponse( responseType, CatalogHandlers.stageTableCreate(catalog, namespace, request)); } else { - return castResponse( - responseType, CatalogHandlers.createTable(catalog, namespace, request)); + var response = CatalogHandlers.createTable(catalog, namespace, request); + CatalogMetrics.getInstance().recordTableCreated(catalog.name()); + return castResponse(responseType, response); } } @@ -197,6 +203,7 @@ public T handle( } else { CatalogHandlers.dropTable(catalog, tableIdentFromPathVars(vars)); } + CatalogMetrics.getInstance().recordTableDropped(catalog.name()); return null; } @@ -225,7 +232,21 @@ public T handle( { TableIdentifier ident = tableIdentFromPathVars(vars); UpdateTableRequest request = castRequest(UpdateTableRequest.class, requestBody); - return castResponse(responseType, CatalogHandlers.updateTable(catalog, ident, request)); + var response = CatalogHandlers.updateTable(catalog, ident, request); + + // Check if this update contains schema changes + boolean hasSchemaUpdate = + request.updates().stream() + .anyMatch( + update -> + update instanceof MetadataUpdate.AddSchema + || update instanceof MetadataUpdate.SetCurrentSchema); + if (hasSchemaUpdate) { + PrometheusMetricsReporter.getInstance() + .recordSchemaUpdate(catalog.name(), ident.namespace().toString(), ident.name()); + } + + return castResponse(responseType, response); } case RENAME_TABLE: @@ -236,9 +257,15 @@ public T handle( } case REPORT_METRICS: - // nothing to do here other than checking that we're getting the correct request - castRequest(ReportMetricsRequest.class, requestBody); - return null; + { + ReportMetricsRequest request = castRequest(ReportMetricsRequest.class, requestBody); + PrometheusMetricsReporter metricsReporter = PrometheusMetricsReporter.getInstance(); + if (metricsReporter != null && request.report() != 
null) { + String catalogName = catalog.name(); + metricsReporter.report(catalogName, request.report()); + } + return null; + } case COMMIT_TRANSACTION: { diff --git a/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/rest/RESTCatalogServlet.java b/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/rest/RESTCatalogServlet.java index 3f8d0d4..4b2563e 100644 --- a/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/rest/RESTCatalogServlet.java +++ b/ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/rest/RESTCatalogServlet.java @@ -19,6 +19,7 @@ package com.altinity.ice.rest.catalog.internal.rest; import com.altinity.ice.rest.catalog.internal.auth.Session; +import com.altinity.ice.rest.catalog.internal.metrics.HttpMetrics; import jakarta.servlet.http.HttpServlet; import jakarta.servlet.http.HttpServletRequest; import jakarta.servlet.http.HttpServletResponse; @@ -72,9 +73,11 @@ public class RESTCatalogServlet extends HttpServlet { .buildOrThrow(); private final RESTCatalogHandler restCatalogAdapter; + private final HttpMetrics httpMetrics; public RESTCatalogServlet(RESTCatalogHandler restCatalogAdapter) { this.restCatalogAdapter = restCatalogAdapter; + this.httpMetrics = HttpMetrics.getInstance(); } protected void handle(HttpServletRequest request, HttpServletResponse response) @@ -84,70 +87,87 @@ protected void handle(HttpServletRequest request, HttpServletResponse response) Pair> routeContext = Route.from(method, path); if (routeContext == null) { - response.setStatus(HttpServletResponse.SC_BAD_REQUEST); - var res = - ErrorResponse.builder() - .responseCode(400) - .withType("BadRequestException") - .withMessage(String.format("No route for %s %s", method, path)) - .build(); - RESTObjectMapper.mapper().writeValue(response.getWriter(), res); + // Track unknown route requests + try (var timer = httpMetrics.startRequest(method.name(), "UNKNOWN")) { + 
timer.setStatusCode(HttpServletResponse.SC_BAD_REQUEST); + response.setStatus(HttpServletResponse.SC_BAD_REQUEST); + var res = + ErrorResponse.builder() + .responseCode(400) + .withType("BadRequestException") + .withMessage(String.format("No route for %s %s", method, path)) + .build(); + byte[] responseBytes = RESTObjectMapper.mapper().writeValueAsBytes(res); + timer.setResponseSize(responseBytes.length); + response.getOutputStream().write(responseBytes); + } return; } - Session session = Session.from(request); - String userToLog = ""; - if (session != null) { - userToLog = "@" + session.uid() + " "; - } - logger.info("{}{} {}", userToLog, method, path); - Route route = routeContext.first(); - Map pathParams = routeContext.second(); - - // FIXME: this should be in RESTCatalogAdapter, not here - Object requestBody = null; - if (route.requestClass() != null) { - requestBody = RESTObjectMapper.mapper().readValue(request.getReader(), route.requestClass()); - } else if (route == Route.TOKENS) { - requestBody = RESTUtil.decodeFormData(CharStreams.toString(request.getReader())); - } - Map queryParams = - request.getParameterMap().entrySet().stream() - .collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue()[0])); - - Map params = - ImmutableMap.builder().putAll(pathParams).putAll(queryParams).build(); - - RESTResponse responseBody; - try { - responseBody = - restCatalogAdapter.handle(session, route, params, requestBody, route.responseClass()); - } catch (Exception e) { - ErrorResponse error = - ErrorResponse.builder() - .responseCode(STATUS_CODE_BY_EXCEPTION.getOrDefault(e.getClass(), 500)) - .withType(e.getClass().getSimpleName()) - .withMessage(e.getMessage()) - .build(); - - if (error.code() < 500) { - logger.warn("{}{} {}: {}", userToLog, method, path, e.getMessage()); - } else { - logger.error("{}{} {}", userToLog, method, path, e); + // Track request with metrics + try (var timer = httpMetrics.startRequest(method.name(), route.name())) { + Session session = 
Session.from(request); + String userToLog = ""; + if (session != null) { + userToLog = "@" + session.uid() + " "; } + logger.info("{}{} {}", userToLog, method, path); - response.setStatus(error.code()); - response.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType()); - RESTObjectMapper.mapper().writeValue(response.getWriter(), error); - return; - } + Map pathParams = routeContext.second(); - response.setStatus(HttpServletResponse.SC_OK); - response.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType()); - if (responseBody != null) { - RESTObjectMapper.mapper().writeValue(response.getWriter(), responseBody); + // FIXME: this should be in RESTCatalogAdapter, not here + Object requestBody = null; + if (route.requestClass() != null) { + requestBody = + RESTObjectMapper.mapper().readValue(request.getReader(), route.requestClass()); + } else if (route == Route.TOKENS) { + requestBody = RESTUtil.decodeFormData(CharStreams.toString(request.getReader())); + } + + Map queryParams = + request.getParameterMap().entrySet().stream() + .collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue()[0])); + + Map params = + ImmutableMap.builder().putAll(pathParams).putAll(queryParams).build(); + + RESTResponse responseBody; + try { + responseBody = + restCatalogAdapter.handle(session, route, params, requestBody, route.responseClass()); + } catch (Exception e) { + ErrorResponse error = + ErrorResponse.builder() + .responseCode(STATUS_CODE_BY_EXCEPTION.getOrDefault(e.getClass(), 500)) + .withType(e.getClass().getSimpleName()) + .withMessage(e.getMessage()) + .build(); + + if (error.code() < 500) { + logger.warn("{}{} {}: {}", userToLog, method, path, e.getMessage()); + } else { + logger.error("{}{} {}", userToLog, method, path, e); + } + + timer.setStatusCode(error.code()); + response.setStatus(error.code()); + response.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType()); + byte[] errorBytes = 
RESTObjectMapper.mapper().writeValueAsBytes(error); + timer.setResponseSize(errorBytes.length); + response.getOutputStream().write(errorBytes); + return; + } + + timer.setStatusCode(HttpServletResponse.SC_OK); + response.setStatus(HttpServletResponse.SC_OK); + response.setHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType()); + if (responseBody != null) { + byte[] responseBytes = RESTObjectMapper.mapper().writeValueAsBytes(responseBody); + timer.setResponseSize(responseBytes.length); + response.getOutputStream().write(responseBytes); + } } } diff --git a/ice/src/main/java/com/altinity/ice/cli/Main.java b/ice/src/main/java/com/altinity/ice/cli/Main.java index f6143ad..aff0355 100644 --- a/ice/src/main/java/com/altinity/ice/cli/Main.java +++ b/ice/src/main/java/com/altinity/ice/cli/Main.java @@ -343,6 +343,10 @@ void insert( names = {"--watch"}, description = "Event queue. Supported: AWS SQS") String watch, + @CommandLine.Option( + names = {"--watch-endpoint"}, + description = "Custom SQS endpoint URL (e.g. 
http://localhost:9324 for LocalStack)") + String watchEndpoint, @CommandLine.Option( names = {"--watch-fire-once"}, description = "") @@ -420,6 +424,7 @@ void insert( if (!watchMode) { Insert.run(catalog, tableId, dataFiles, options); } else { + boolean metricsEnabled = false; if (!Strings.isNullOrEmpty(watchDebugAddr)) { JvmMetrics.builder().register(); @@ -432,10 +437,19 @@ void insert( throw new RuntimeException(e); // TODO: find a better one } logger.info("Serving http://{}/{metrics,healtz,livez,readyz}", debugHostAndPort); + metricsEnabled = true; } InsertWatch.run( - catalog, tableId, dataFiles, watch, watchFireOnce, createTableIfNotExists, options); + catalog, + tableId, + dataFiles, + watch, + watchEndpoint, + watchFireOnce, + createTableIfNotExists, + options, + metricsEnabled); } } } diff --git a/ice/src/main/java/com/altinity/ice/cli/internal/cmd/InsertWatch.java b/ice/src/main/java/com/altinity/ice/cli/internal/cmd/InsertWatch.java index bfad50e..5e4f779 100644 --- a/ice/src/main/java/com/altinity/ice/cli/internal/cmd/InsertWatch.java +++ b/ice/src/main/java/com/altinity/ice/cli/internal/cmd/InsertWatch.java @@ -9,11 +9,13 @@ */ package com.altinity.ice.cli.internal.cmd; +import com.altinity.ice.cli.internal.metrics.InsertWatchMetrics; import com.altinity.ice.internal.io.Matcher; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; +import java.net.URI; import java.net.URLDecoder; import java.nio.charset.StandardCharsets; import java.time.Duration; @@ -31,6 +33,7 @@ import org.slf4j.LoggerFactory; import software.amazon.awssdk.core.exception.SdkException; import software.amazon.awssdk.services.sqs.SqsClient; +import software.amazon.awssdk.services.sqs.SqsClientBuilder; import software.amazon.awssdk.services.sqs.model.BatchResultErrorEntry; import software.amazon.awssdk.services.sqs.model.DeleteMessageBatchRequest; import 
software.amazon.awssdk.services.sqs.model.DeleteMessageBatchRequestEntry; @@ -42,6 +45,7 @@ public class InsertWatch { private static final Logger logger = LoggerFactory.getLogger(InsertWatch.class); private static final ObjectMapper objectMapper = new ObjectMapper(); + private static final String QUEUE_TYPE_SQS = "sqs"; public static void run( RESTCatalog catalog, @@ -52,6 +56,29 @@ public static void run( boolean createTableIfNotExists, Insert.Options options) throws IOException, InterruptedException { + run( + catalog, + nsTable, + input, + sqsQueueURL, + null, + terminateAfterOneBatch, + createTableIfNotExists, + options, + false); + } + + public static void run( + RESTCatalog catalog, + TableIdentifier nsTable, + String[] input, + String sqsQueueURL, + String sqsOverrideEndpoint, + boolean terminateAfterOneBatch, + boolean createTableIfNotExists, + Insert.Options options, + boolean metricsEnabled) + throws IOException, InterruptedException { if (!options.noCopy() || !options.skipDuplicates()) { throw new IllegalArgumentException( @@ -63,8 +90,15 @@ public static void run( } var matchers = Arrays.stream(input).map(Matcher::from).toList(); + logger.info("Watching for files matching: {}", Arrays.toString(input)); - final SqsClient sqs = SqsClient.builder().build(); + // Initialize metrics if enabled + InsertWatchMetrics metrics = metricsEnabled ? 
InsertWatchMetrics.getInstance() : null; + String tableLabel = nsTable.toString(); + String queueLabel = sqsQueueURL; + String queueType = QUEUE_TYPE_SQS; + + final SqsClient sqs = buildSqsClient(sqsOverrideEndpoint); ReceiveMessageRequest req = ReceiveMessageRequest.builder() .queueUrl(sqsQueueURL) @@ -91,9 +125,16 @@ public static void run( do { List batch = new LinkedList<>(); try { + if (metrics != null) { + metrics.recordPollRequest(tableLabel, queueLabel, queueType); + } var messages = sqs.receiveMessage(req).messages(); batch.addAll(messages); } catch (SdkException e) { + if (metrics != null) { + metrics.recordQueueReceiveError(tableLabel, queueLabel, queueType); + metrics.recordRetryAttempt(tableLabel, queueLabel, queueType); + } if (!e.retryable()) { throw e; // TODO: should we really? } @@ -112,17 +153,28 @@ public static void run( batch.addAll(tailMessages); } while (!tailMessages.isEmpty() && batch.size() < maxBatchSize); + if (metrics != null) { + metrics.recordMessagesReceived(tableLabel, queueLabel, queueType, batch.size()); + } + logger.info("Processing {} message(s)", batch.size()); // FIXME: handle files not found - var insertBatch = filter(batch, matchers); + var insertBatch = filter(batch, matchers, metrics, tableLabel, queueLabel, queueType); if (!insertBatch.isEmpty()) { logger.info("Inserting {}", insertBatch); try { Insert.run(catalog, nsTable, insertBatch.toArray(String[]::new), options); + if (metrics != null) { + metrics.recordFilesInserted(tableLabel, queueLabel, queueType, insertBatch.size()); + metrics.recordTransactionSuccess(tableLabel, queueLabel, queueType); + } } catch (NoSuchTableException e) { if (!createTableIfNotExists) { + if (metrics != null) { + metrics.recordTransactionFailed(tableLabel, queueLabel, queueType); + } throw e; } boolean retryInsert = true; @@ -139,6 +191,9 @@ public static void run( null); } catch (NotFoundException nfe) { if (!options.ignoreNotFound()) { + if (metrics != null) { + 
metrics.recordTransactionFailed(tableLabel, queueLabel, queueType); + } throw nfe; } logger.info("Table not created ({} don't exist)", insertBatch); @@ -146,16 +201,25 @@ public static void run( } if (retryInsert) { Insert.run(catalog, nsTable, insertBatch.toArray(String[]::new), options); + if (metrics != null) { + metrics.recordFilesInserted( + tableLabel, queueLabel, queueType, insertBatch.size()); + metrics.recordTransactionSuccess(tableLabel, queueLabel, queueType); + } } } } - confirmProcessed(sqs, sqsQueueURL, batch); + confirmProcessed(sqs, sqsQueueURL, batch, metrics, tableLabel, queueLabel, queueType); } catch (InterruptedException e) { // terminate Thread.currentThread().interrupt(); throw new InterruptedException(); } catch (Exception e) { + if (metrics != null) { + metrics.recordTransactionFailed(tableLabel, queueLabel, queueType); + metrics.recordRetryAttempt(tableLabel, queueLabel, queueType); + } Duration delay = backoff.get(); logger.error("Failed to process batch of messages (retry in {})", delay, e); Thread.sleep(delay); @@ -166,7 +230,13 @@ public static void run( } while (!terminateAfterOneBatch); } - private static Collection filter(List messages, Collection matchers) { + private static Collection filter( + List messages, + Collection matchers, + InsertWatchMetrics metrics, + String tableLabel, + String queueLabel, + String queueType) { Collection r = new LinkedHashSet<>(); for (Message message : messages) { // Message body() example: @@ -193,23 +263,41 @@ private static Collection filter(List messages, Collection {}", eventName, target); // s3:ObjectCreated:{Put,Post,Copy,CompleteMultipartUpload} if (eventName.startsWith("ObjectCreated:")) { // TODO: exclude metadata/data dirs by default if (matchers.stream().anyMatch(matcher -> matcher.test(target))) { r.add(target); + if (metrics != null) { + metrics.recordEventMatched(tableLabel, queueLabel, queueType); + } + } else { + logger.info("Target did not match any input pattern: {}", target); + 
if (metrics != null) { + metrics.recordEventNotMatched(tableLabel, queueLabel, queueType); + } } } else { + if (metrics != null) { + metrics.recordEventSkipped(tableLabel, queueLabel, queueType); + } if (logger.isTraceEnabled()) { logger.trace("Message skipped: {} {}", eventName, target); } @@ -219,7 +307,14 @@ private static Collection filter(List messages, Collection messages) { + private static void confirmProcessed( + SqsClient sqs, + String sqsQueueURL, + List messages, + InsertWatchMetrics metrics, + String tableLabel, + String queueLabel, + String queueType) { int failedCount = 0; int len = messages.size(); for (int i = 0; i < len; i = i + 10) { @@ -228,6 +323,9 @@ private static void confirmProcessed(SqsClient sqs, String sqsQueueURL, List failed = res.failed(); failedCount += failed.size(); + if (metrics != null) { + metrics.recordQueueDeleteError(tableLabel, queueLabel, queueType, failed.size()); + } } } if (failedCount > 0) { @@ -252,4 +350,18 @@ private static DeleteMessageBatchResponse deleteMessageBatch( .toList()) .build()); } + + private static SqsClient buildSqsClient(String sqsOverrideEndpoint) { + SqsClientBuilder builder = SqsClient.builder(); + + // Use explicit endpoint if provided (e.g. for LocalStack) + // Otherwise, AWS SDK will use AWS_ENDPOINT_URL_SQS env var or default AWS endpoints + if (sqsOverrideEndpoint != null && !sqsOverrideEndpoint.isEmpty()) { + URI endpoint = URI.create(sqsOverrideEndpoint); + logger.info("Using custom SQS endpoint: {}", endpoint); + builder.endpointOverride(endpoint); + } + + return builder.build(); + } } diff --git a/ice/src/main/java/com/altinity/ice/cli/internal/metrics/InsertWatchMetrics.java b/ice/src/main/java/com/altinity/ice/cli/internal/metrics/InsertWatchMetrics.java new file mode 100644 index 0000000..483bc46 --- /dev/null +++ b/ice/src/main/java/com/altinity/ice/cli/internal/metrics/InsertWatchMetrics.java @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2025 Altinity Inc and/or its affiliates. 
All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ +package com.altinity.ice.cli.internal.metrics; + +import io.prometheus.metrics.core.metrics.Counter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Prometheus metrics for the InsertWatch (S3 watch) functionality. + * + *

This class uses a singleton pattern because Prometheus metrics can only be registered once per + * JVM. + */ +public class InsertWatchMetrics { + + private static final Logger logger = LoggerFactory.getLogger(InsertWatchMetrics.class); + + private static class Holder { + private static final InsertWatchMetrics INSTANCE = new InsertWatchMetrics(); + } + + // ========================================================================== + // Metric Names + // ========================================================================== + + private static final String LABEL_TABLE = "table"; + private static final String LABEL_QUEUE = "queue"; + private static final String LABEL_QUEUE_TYPE = "queue_type"; + + private static final String[] WATCH_LABELS = {LABEL_TABLE, LABEL_QUEUE, LABEL_QUEUE_TYPE}; + + // Messages/Files processed + private static final String MESSAGES_RECEIVED_TOTAL_NAME = "ice_watch_messages_received_total"; + private static final String MESSAGES_RECEIVED_TOTAL_HELP = + "Total number of SQS messages received"; + + private static final String EVENTS_RECEIVED_TOTAL_NAME = "ice_watch_events_received_total"; + private static final String EVENTS_RECEIVED_TOTAL_HELP = + "Total number of S3 events received (one message may contain multiple events)"; + + private static final String EVENTS_MATCHED_TOTAL_NAME = "ice_watch_events_matched_total"; + private static final String EVENTS_MATCHED_TOTAL_HELP = + "Total number of S3 events that matched the pattern"; + + private static final String EVENTS_NOT_MATCHED_TOTAL_NAME = "ice_watch_events_not_matched_total"; + private static final String EVENTS_NOT_MATCHED_TOTAL_HELP = + "Total number of S3 events that did not match any input pattern"; + + private static final String EVENTS_SKIPPED_TOTAL_NAME = "ice_watch_events_skipped_total"; + private static final String EVENTS_SKIPPED_TOTAL_HELP = + "Total number of S3 events skipped (non-ObjectCreated events)"; + + // Files inserted + private static final String 
FILES_INSERTED_TOTAL_NAME = "ice_watch_files_inserted_total"; + private static final String FILES_INSERTED_TOTAL_HELP = + "Total number of files successfully inserted into the catalog"; + + // Transactions + private static final String TRANSACTIONS_TOTAL_NAME = "ice_watch_transactions_total"; + private static final String TRANSACTIONS_TOTAL_HELP = + "Total number of insert transactions committed"; + + private static final String TRANSACTIONS_FAILED_TOTAL_NAME = + "ice_watch_transactions_failed_total"; + private static final String TRANSACTIONS_FAILED_TOTAL_HELP = + "Total number of insert transactions that failed"; + + // Retry state + private static final String RETRY_ATTEMPTS_TOTAL_NAME = "ice_watch_retry_attempts_total"; + private static final String RETRY_ATTEMPTS_TOTAL_HELP = + "Total number of retry attempts due to failures"; + + // Queue errors (SQS, Kafka, etc.) + private static final String QUEUE_RECEIVE_ERRORS_TOTAL_NAME = + "ice_watch_queue_receive_errors_total"; + private static final String QUEUE_RECEIVE_ERRORS_TOTAL_HELP = + "Total number of errors when receiving messages from queue"; + + private static final String QUEUE_DELETE_ERRORS_TOTAL_NAME = + "ice_watch_queue_delete_errors_total"; + private static final String QUEUE_DELETE_ERRORS_TOTAL_HELP = + "Total number of errors when deleting/acknowledging messages from queue"; + + // Parse errors + private static final String MESSAGE_PARSE_ERRORS_TOTAL_NAME = + "ice_watch_message_parse_errors_total"; + private static final String MESSAGE_PARSE_ERRORS_TOTAL_HELP = + "Total number of message parsing errors"; + + // Poll requests + private static final String POLL_REQUESTS_TOTAL_NAME = "ice_watch_poll_requests_total"; + private static final String POLL_REQUESTS_TOTAL_HELP = + "Total number of poll requests to the message queue"; + + // ========================================================================== + // Metrics + // ========================================================================== + + 
private final Counter messagesReceivedTotal; + private final Counter eventsReceivedTotal; + private final Counter eventsMatchedTotal; + private final Counter eventsNotMatchedTotal; + private final Counter eventsSkippedTotal; + private final Counter filesInsertedTotal; + private final Counter transactionsTotal; + private final Counter transactionsFailedTotal; + private final Counter retryAttemptsTotal; + private final Counter queueReceiveErrorsTotal; + private final Counter queueDeleteErrorsTotal; + private final Counter messageParseErrorsTotal; + private final Counter pollRequestsTotal; + + /** Returns the singleton instance of the metrics reporter. */ + public static InsertWatchMetrics getInstance() { + return Holder.INSTANCE; + } + + private InsertWatchMetrics() { + this.messagesReceivedTotal = + Counter.builder() + .name(MESSAGES_RECEIVED_TOTAL_NAME) + .help(MESSAGES_RECEIVED_TOTAL_HELP) + .labelNames(WATCH_LABELS) + .register(); + + this.eventsReceivedTotal = + Counter.builder() + .name(EVENTS_RECEIVED_TOTAL_NAME) + .help(EVENTS_RECEIVED_TOTAL_HELP) + .labelNames(WATCH_LABELS) + .register(); + + this.eventsMatchedTotal = + Counter.builder() + .name(EVENTS_MATCHED_TOTAL_NAME) + .help(EVENTS_MATCHED_TOTAL_HELP) + .labelNames(WATCH_LABELS) + .register(); + + this.eventsNotMatchedTotal = + Counter.builder() + .name(EVENTS_NOT_MATCHED_TOTAL_NAME) + .help(EVENTS_NOT_MATCHED_TOTAL_HELP) + .labelNames(WATCH_LABELS) + .register(); + + this.eventsSkippedTotal = + Counter.builder() + .name(EVENTS_SKIPPED_TOTAL_NAME) + .help(EVENTS_SKIPPED_TOTAL_HELP) + .labelNames(WATCH_LABELS) + .register(); + + this.filesInsertedTotal = + Counter.builder() + .name(FILES_INSERTED_TOTAL_NAME) + .help(FILES_INSERTED_TOTAL_HELP) + .labelNames(WATCH_LABELS) + .register(); + + this.transactionsTotal = + Counter.builder() + .name(TRANSACTIONS_TOTAL_NAME) + .help(TRANSACTIONS_TOTAL_HELP) + .labelNames(WATCH_LABELS) + .register(); + + this.transactionsFailedTotal = + Counter.builder() + 
.name(TRANSACTIONS_FAILED_TOTAL_NAME) + .help(TRANSACTIONS_FAILED_TOTAL_HELP) + .labelNames(WATCH_LABELS) + .register(); + + this.retryAttemptsTotal = + Counter.builder() + .name(RETRY_ATTEMPTS_TOTAL_NAME) + .help(RETRY_ATTEMPTS_TOTAL_HELP) + .labelNames(WATCH_LABELS) + .register(); + + this.queueReceiveErrorsTotal = + Counter.builder() + .name(QUEUE_RECEIVE_ERRORS_TOTAL_NAME) + .help(QUEUE_RECEIVE_ERRORS_TOTAL_HELP) + .labelNames(WATCH_LABELS) + .register(); + + this.queueDeleteErrorsTotal = + Counter.builder() + .name(QUEUE_DELETE_ERRORS_TOTAL_NAME) + .help(QUEUE_DELETE_ERRORS_TOTAL_HELP) + .labelNames(WATCH_LABELS) + .register(); + + this.messageParseErrorsTotal = + Counter.builder() + .name(MESSAGE_PARSE_ERRORS_TOTAL_NAME) + .help(MESSAGE_PARSE_ERRORS_TOTAL_HELP) + .labelNames(WATCH_LABELS) + .register(); + + this.pollRequestsTotal = + Counter.builder() + .name(POLL_REQUESTS_TOTAL_NAME) + .help(POLL_REQUESTS_TOTAL_HELP) + .labelNames(WATCH_LABELS) + .register(); + + logger.info("InsertWatch Prometheus metrics initialized"); + } + + public void recordMessagesReceived(String table, String queue, String queueType, int count) { + messagesReceivedTotal.labelValues(table, queue, queueType).inc(count); + } + + public void recordEventsReceived(String table, String queue, String queueType, int count) { + eventsReceivedTotal.labelValues(table, queue, queueType).inc(count); + } + + public void recordEventMatched(String table, String queue, String queueType) { + eventsMatchedTotal.labelValues(table, queue, queueType).inc(); + } + + public void recordEventNotMatched(String table, String queue, String queueType) { + eventsNotMatchedTotal.labelValues(table, queue, queueType).inc(); + } + + public void recordEventSkipped(String table, String queue, String queueType) { + eventsSkippedTotal.labelValues(table, queue, queueType).inc(); + } + + public void recordFilesInserted(String table, String queue, String queueType, int count) { + filesInsertedTotal.labelValues(table, queue, 
queueType).inc(count); + } + + public void recordTransactionSuccess(String table, String queue, String queueType) { + transactionsTotal.labelValues(table, queue, queueType).inc(); + } + + public void recordTransactionFailed(String table, String queue, String queueType) { + transactionsFailedTotal.labelValues(table, queue, queueType).inc(); + } + + public void recordRetryAttempt(String table, String queue, String queueType) { + retryAttemptsTotal.labelValues(table, queue, queueType).inc(); + } + + public void recordQueueReceiveError(String table, String queue, String queueType) { + queueReceiveErrorsTotal.labelValues(table, queue, queueType).inc(); + } + + public void recordQueueDeleteError(String table, String queue, String queueType, int count) { + queueDeleteErrorsTotal.labelValues(table, queue, queueType).inc(count); + } + + public void recordMessageParseError(String table, String queue, String queueType) { + messageParseErrorsTotal.labelValues(table, queue, queueType).inc(); + } + + public void recordPollRequest(String table, String queue, String queueType) { + pollRequestsTotal.labelValues(table, queue, queueType).inc(); + } +}