diff --git a/app_python/app.py b/app_python/app.py
index 77e4af2f8a..14d3caa4c0 100644
--- a/app_python/app.py
+++ b/app_python/app.py
@@ -3,18 +3,59 @@
 Main application module
 """
 import json
-from flask import Flask, jsonify, request
+from flask import Flask, Response, jsonify, request
 from datetime import datetime, timezone
 import logging
 import os
 import platform
 import socket
 
+from prometheus_client import CONTENT_TYPE_LATEST, Counter, Histogram, Gauge, generate_latest
+
 HOST = os.getenv('HOST', '0.0.0.0')
 PORT = int(os.getenv('PORT', 5000))
 DEBUG = os.getenv('DEBUG', 'False').lower() == 'true'
 
 
+class PrometheusStats:
+    """Prometheus collectors that instrument this service.
+
+    Attributes:
+        http_requests_total: requests counted by method, endpoint and status.
+        http_request_duration_seconds: request duration by method and endpoint.
+        http_requests_in_progress: requests currently being processed.
+        system_info_duration_seconds: time spent building the '/' payload.
+    """
+
+    http_requests_total: Counter
+    http_request_duration_seconds: Histogram
+    http_requests_in_progress: Gauge
+    system_info_duration_seconds: Histogram
+
+    def __init__(self):
+        self.http_requests_total = Counter(
+            'http_requests_total',
+            'Total HTTP requests',
+            ['method', 'endpoint', 'status']
+        )
+        self.http_request_duration_seconds = Histogram(
+            'http_request_duration_seconds',
+            'HTTP request duration',
+            ['method', 'endpoint']
+        )
+        self.http_requests_in_progress = Gauge(
+            'http_requests_in_progress',
+            'HTTP requests currently being processed'
+        )
+        self.system_info_duration_seconds = Histogram(
+            'system_info_duration_seconds',
+            'System stats collection time'
+        )
+
+
+prometheus = PrometheusStats()
+
+
 class JSONFormatter(logging.Formatter):
     def __init__(self):
         super().__init__()
@@ -92,27 +133,33 @@ def get_http_extra_info():
 
 
 @app.route('/')
+@prometheus.http_request_duration_seconds.labels('GET', '/').time()
+@prometheus.http_requests_in_progress.track_inprogress()
 def index():
     """Main endpoint - service and system information."""
     logger.debug(f'Request: {request.method} {request.path}', extra=get_http_extra_info())
-    return jsonify({
-        'service': {
-            'name': 'devops-info-service',
-            'version': '1.0.0',
-            'description': 'DevOps course info service',
-            'framework': 'Flask'
-        },
-        'system': get_system_info(),
-        'runtime': get_uptime(),
-        'request': get_request_info(),
-        'endpoints': [
-            {"path": "/", "method": "GET", "description": "Service information"},
-            {"path": "/health", "method": "GET", "description": "Health check"}
-        ]
-    })
+    with prometheus.system_info_duration_seconds.time():
+        response = {
+            'service': {
+                'name': 'devops-info-service',
+                'version': '1.0.0',
+                'description': 'DevOps course info service',
+                'framework': 'Flask'
+            },
+            'system': get_system_info(),
+            'runtime': get_uptime(),
+            'request': get_request_info(),
+            'endpoints': [
+                {"path": "/", "method": "GET", "description": "Service information"},
+                {"path": "/health", "method": "GET", "description": "Health check"}
+            ]
+        }
+    return jsonify(response)
 
 
 @app.route('/health')
+@prometheus.http_request_duration_seconds.labels('GET', '/health').time()
+@prometheus.http_requests_in_progress.track_inprogress()
 def health():
     logger.debug(f'Request: {request.method} {request.path}', extra=get_http_extra_info())
     return jsonify({
@@ -122,6 +169,16 @@ def health():
     })
+
+
+@app.route('/metrics')
+@prometheus.http_request_duration_seconds.labels('GET', '/metrics').time()
+@prometheus.http_requests_in_progress.track_inprogress()
+def metrics():
+    """Serve the collected metrics in the Prometheus text format."""
+    # CONTENT_TYPE_LATEST carries the format version and charset that match
+    # the output of generate_latest(); a bare 'text/plain' omits both.
+    return Response(response=generate_latest(), status=200, content_type=CONTENT_TYPE_LATEST)
+
+
 @app.errorhandler(404)
 def notfound_handler(e):
     logger.info('A 404 Not Found error occured', extra=get_http_extra_info())
@@ -140,6 +197,21 @@ def internal_error(e):
     }), 500
+
+
+@app.after_request
+def after_request(response: Response):
+    """Count every finished request in ``http_requests_total``.
+
+    The ``endpoint`` label is the matched route rule, not the raw path, so
+    requests to unknown URLs share one label value instead of creating a
+    new time series per distinct path.
+    """
+    url_rule = request.url_rule
+    endpoint = url_rule.rule if url_rule is not None else 'unmatched'
+    prometheus.http_requests_total.labels(request.method, endpoint,
+                                          str(response.status_code)).inc()
+    return response
+
+
 START_TIME = datetime.now(timezone.utc)
 logger.info('Application starting... 
Configured with log level=%s', 'DEBUG' if DEBUG else 'INFO') if __name__ == '__main__': diff --git a/app_python/requirements.txt b/app_python/requirements.txt index f0950a10ff..ad21365013 100644 --- a/app_python/requirements.txt +++ b/app_python/requirements.txt @@ -1,2 +1,3 @@ Flask==3.1.0 gunicorn==24.0.0 +prometheus-client==0.23.1 diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml index df98f69d30..00a5f1cc63 100644 --- a/monitoring/docker-compose.yml +++ b/monitoring/docker-compose.yml @@ -17,7 +17,7 @@ services: resources: limits: cpus: '1.0' - memory: 3G + memory: 1G reservations: cpus: '1' memory: 1G @@ -52,11 +52,8 @@ services: deploy: resources: limits: - cpus: '1.0' - memory: 1G - reservations: - cpus: '1' - memory: 500M + cpus: '0.5' + memory: 512M healthcheck: test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"] interval: 10s @@ -76,12 +73,37 @@ services: deploy: resources: limits: - cpus: '3.0' - memory: 4G - reservations: - cpus: '3' + cpus: '0.5' + memory: 256M + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:5000/health || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + + prometheus: + image: prom/prometheus:v3.9.0 + ports: + - 9090:9090 + volumes: + - ${PWD}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus-data:/prometheus:rw + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.retention.time=15d' + - '--storage.tsdb.retention.size=10GB' + deploy: + resources: + limits: memory: 1G + cpus: '1.0' + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:9090/-/healthy || exit 1"] + interval: 10s + timeout: 5s + retries: 5 volumes: loki-data: grafana-data: + prometheus-data: diff --git a/monitoring/docs/L8t1_metrics_endpoint.png b/monitoring/docs/L8t1_metrics_endpoint.png new file mode 100644 index 0000000000..4d02ea772f Binary files /dev/null and 
b/monitoring/docs/L8t1_metrics_endpoint.png differ diff --git a/monitoring/docs/L8t2_query.png b/monitoring/docs/L8t2_query.png new file mode 100644 index 0000000000..d45e53a0d9 Binary files /dev/null and b/monitoring/docs/L8t2_query.png differ diff --git a/monitoring/docs/L8t2_targets_up.png b/monitoring/docs/L8t2_targets_up.png new file mode 100644 index 0000000000..daf9d61e2d Binary files /dev/null and b/monitoring/docs/L8t2_targets_up.png differ diff --git a/monitoring/docs/L8t3_custom_dashboard.png b/monitoring/docs/L8t3_custom_dashboard.png new file mode 100644 index 0000000000..cb9c621d26 Binary files /dev/null and b/monitoring/docs/L8t3_custom_dashboard.png differ diff --git a/monitoring/docs/L8t4_healthy.png b/monitoring/docs/L8t4_healthy.png new file mode 100644 index 0000000000..fad85d6522 Binary files /dev/null and b/monitoring/docs/L8t4_healthy.png differ diff --git a/monitoring/docs/LAB08.md b/monitoring/docs/LAB08.md new file mode 100644 index 0000000000..70b9dd443a --- /dev/null +++ b/monitoring/docs/LAB08.md @@ -0,0 +1,746 @@ +# Task 1 + +### Screenshot of `/metrics` + +![metrics](/monitoring/docs/L8t1_metrics_endpoint.png) + +### Definition of metrics + +```python +# ... +from prometheus_client import Counter, Histogram, Gauge, generate_latest + +# ... 
+ +class PrometheusStats: + http_requests_total: Counter + http_request_duration_seconds: Histogram + http_requests_in_progress: Gauge + system_info_duration_seconds: Histogram + + def __init__(self): + self.http_requests_total = Counter( + 'http_requests_total', + 'Total HTTP requests', + ['method', 'endpoint', 'status'] + ) + self.http_request_duration_seconds = Histogram( + 'http_request_duration_seconds', + 'HTTP request duration', + ['method', 'endpoint'] + ) + self.http_requests_in_progress = Gauge( + 'http_requests_in_progress', + 'HTTP requests currently being processed' + ) + self.system_info_duration_seconds = Histogram( + 'system_info_duration_seconds', + 'System stats collection time' + ) + + +prometheus = PrometheusStats() +# ... +``` + +### Explanation of metrics choice + +The HTTP-related metrics in the declaration above are required by the lab. + +The only logic-related metric that made sense in the context of the app is the system info query time, so I implemented +it similarly to `http_request_duration_seconds`, as a Histogram. I could subdivide it into queries to different parts of +the system, but the metric reports very low time consumption already, so I think there is no point in that. 
+ +# Task 2 + +### All targets up + +![targets](/monitoring/docs/L8t2_targets_up.png) + +### Successful query + +![query](/monitoring/docs/L8t2_query.png) + +### Prometheus config + +```yaml +global: + scrape_interval: 15s + evaluation_interval: 15s + +# Storage retention (Prometheus 3.x config-based retention) +storage: + tsdb: + retention: + time: 15d + size: 10GB + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'app' + static_configs: + - targets: ['devops-infoservice:5000'] + + - job_name: 'loki' + static_configs: + - targets: ['loki:3100'] + + - job_name: 'grafana' + static_configs: + - targets: ['grafana:3000'] +``` + +# Task 3 + +### Custom dashboard with panels + +![panels](/monitoring/docs/L8t3_custom_dashboard.png) + +### JSON dashboard definition + +```json +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 0, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "bfggznoog7gn4c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 5, + "x": 0, + "y": 0 + }, + "id": 7, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.3.1", + "targets": [ + 
{ + "editorMode": "code", + "expr": "up{job=\"app\"}", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Uptime", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bfggznoog7gn4c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 7, + "x": 5, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bfggznoog7gn4c" + }, + "editorMode": "code", + "expr": "sum(rate(http_requests_total[5m])) by (endpoint)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Request rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bfggznoog7gn4c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + 
"axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "editorMode": "code", + "expr": "http_requests_in_progress", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Active requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bfggznoog7gn4c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 5, + "x": 0, + "y": 3 + }, + "id": 6, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "sort": "desc", + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + 
"targets": [ + { + "editorMode": "code", + "expr": "sum by (status) (rate(http_requests_total[5m]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Status code distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bfggznoog7gn4c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Request duration p95", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bfggznoog7gn4c" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 4, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(http_request_duration_seconds_bucket[5m])", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Request duration heatmap", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bfggznoog7gn4c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + 
"y": 17 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Error rate", + "type": "timeseries" + } + ], + "preload": false, + "schemaVersion": 42, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Prometheus custom", + "uid": "adgptz2", + "version": 11 +} +``` + +# Task 4 + +Every container is healthy: + +![health](/monitoring/docs/L8t4_healthy.png) + +# Task 5 + +**Documentation**. I added everything according to the lab and followed every step. The architecture is as follows: + +```text +[Devops-infoservice] -----------metrics--------> [Prometheus] + | | + logs query_responses + | | + v v + [Promtail] --logs-> [Loki] --query_responses--> [Grafana] +``` + +For further evidence and code listings, see sections above. diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml new file mode 100644 index 0000000000..1a516fdca8 --- /dev/null +++ b/monitoring/prometheus/prometheus.yml @@ -0,0 +1,27 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +# Storage retention (Prometheus 3.x config-based retention) +storage: + tsdb: + retention: + time: 15d + size: 10GB + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'app' + static_configs: + - targets: ['devops-infoservice:5000'] + + - job_name: 'loki' + static_configs: + - targets: ['loki:3100'] + + - job_name: 'grafana' + static_configs: + - targets: ['grafana:3000']