diff --git a/README.md b/README.md index e222898..720e278 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,7 @@ # Severed Infra: Cloud-Native Home Lab This repository contains the Infrastructure-as-Code (IaC) and manifest definitions for **Severed**, a modern blog and -observability stack running on Kubernetes (K3d). It demonstrates how to decouple configuration from code, automate -observability, and secure internal services—all running locally on your laptop. +observability stack running on Kubernetes (K3d). ## Architecture diff --git a/apps/severed-blog-config.yaml b/apps/severed-blog-config.yaml index 4033445..c91a4ab 100644 --- a/apps/severed-blog-config.yaml +++ b/apps/severed-blog-config.yaml @@ -5,12 +5,19 @@ metadata: namespace: severed-apps data: default.conf: | + # 1. Define the custom log format + log_format observability '$remote_addr - $remote_user [$time_local] "$request" ' + '$status $body_bytes_sent "$http_referer" ' + '"$http_user_agent" "$request_time"'; + server { listen 80; server_name localhost; root /usr/share/nginx/html; index index.html index.htm; - access_log /dev/stdout; + + # 2. 
Apply the format to stdout + access_log /dev/stdout observability; error_log /dev/stderr; # gzip compression @@ -50,10 +57,10 @@ data: # metrics endpoint for Alloy/Prometheus location /metrics { stub_status on; - access_log off; + access_log off; # Keep noise out of our main logs allow 127.0.0.1; - allow 10.0.0.0/8; # Allow internal cluster pods - allow 172.16.0.0/12; # Allow K3d/Docker internal bridge network + allow 10.0.0.0/8; + allow 172.16.0.0/12; deny all; } } diff --git a/apps/severed-blog-hpa.yaml b/apps/severed-blog-hpa.yaml index c37a246..6559975 100644 --- a/apps/severed-blog-hpa.yaml +++ b/apps/severed-blog-hpa.yaml @@ -29,7 +29,7 @@ spec: name: nginx_http_requests_total target: type: AverageValue - averageValue: "10k" # Scale up if requests > 10K per second per pod + averageValue: "500" # Scale up if requests > 500 per second per pod behavior: scaleDown: stabilizationWindowSeconds: 300 # Wait 5 minutes before removing a pod diff --git a/infra/alloy-setup.yaml b/infra/alloy-setup.yaml index ba8feb5..505ef57 100644 --- a/infra/alloy-setup.yaml +++ b/infra/alloy-setup.yaml @@ -11,12 +11,21 @@ kind: ClusterRole metadata: name: alloy-cluster-role rules: + # 1. Standard API Access - apiGroups: [ "" ] resources: [ "nodes", "nodes/proxy", "services", "endpoints", "pods" ] verbs: [ "get", "list", "watch" ] + # 2. ALLOW METRICS ACCESS (Crucial for cAdvisor/Kubelet) + - apiGroups: [ "" ] + resources: [ "nodes/stats", "nodes/metrics" ] + verbs: [ "get" ] + # 3. Log Access - apiGroups: [ "" ] resources: [ "pods/log" ] verbs: [ "get", "list", "watch" ] + # 4. Non-Resource URLs (Sometimes needed for /metrics endpoints) + - nonResourceURLs: ["/metrics"] + verbs: ["get"] --- apiVersion: rbac.authorization.k8s.io/v1 @@ -84,8 +93,6 @@ data: targets = discovery.relabel.blog_pods.output forward_to = [prometheus.remote_write.metrics_service.receiver] job_name = "integrations/nginx" - - // Removed the restrictive metric_relabel to ensure data flows } // 4. 
Host Metrics (Unix Exporter) @@ -118,6 +125,26 @@ data: url = sys.env("LOKI_URL") } } + + // 7. Kubelet Scraper (cAdvisor for Container Metrics) + discovery.kubernetes "k8s_nodes" { + role = "node" + } + + prometheus.scrape "kubelet_cadvisor" { + targets = discovery.kubernetes.k8s_nodes.targets + scheme = "https" + metrics_path = "/metrics/cadvisor" + job_name = "integrations/kubernetes/cadvisor" + + tls_config { + insecure_skip_verify = true + } + bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token" + + forward_to = [prometheus.remote_write.metrics_service.receiver] + } + --- # --- Agent Deployment (DaemonSet) --- apiVersion: apps/v1 @@ -144,6 +171,7 @@ spec: args: - run - --server.http.listen-addr=0.0.0.0:12345 + - --storage.path=/var/lib/alloy/data - /etc/alloy/config.alloy envFrom: - configMapRef: diff --git a/infra/observer/dashboard-json.yaml b/infra/observer/dashboard-json.yaml index e06aaaf..b11bd28 100644 --- a/infra/observer/dashboard-json.yaml +++ b/infra/observer/dashboard-json.yaml @@ -25,7 +25,7 @@ data: "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 0, + "id": 1, "links": [], "panels": [ { @@ -128,7 +128,7 @@ data: }, "editorMode": "code", "expr": "node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes", - "legendFormat": "Used Memory", + "legendFormat": "{{instance}}", "range": true, "refId": "A" } @@ -336,6 +336,151 @@ data: "type": "loki", "uid": "Loki" }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "green", + "mode": "fixed", + "seriesBy": "last" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + 
"scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "fieldMinMax": false, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/^4\\d{2}$/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/^5\\d{2}$/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/^3\\d{2}$/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 9 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "direction": "backward", + "editorMode": "code", + "expr": "sum by (status) (\n count_over_time({namespace=\"severed-apps\"} \n | regexp `HTTP/1.1\" (?P<status>[12345]\\d{2})` [1m])\n)", + "legendFormat": "{{status}}", + "queryType": "range", + "refId": "A" + } + ], + "title": "HTTP Requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, "fieldConfig": { "defaults": { "color": { @@ -376,6 +521,7 @@ data: } }, "mappings": [], + "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ @@ -395,10 +541,10 @@ data: "gridPos": { "h": 8, "w": 8, - "x": 8, + "x": 16, "y": 9 }, - "id": 4, + "id": 9, "options": { 
"legend": { "calcs": [], @@ -415,18 +561,24 @@ data: "pluginVersion": "12.3.1", "targets": [ { - "expr": "sum(rate({app=\"severed-blog\"}[1m]))", - "legendFormat": "Requests/sec", + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(nginx_connections_active{namespace=\"severed-apps\"})", + "legendFormat": "Live Connections", + "range": true, "refId": "A" } ], - "title": "Web Traffic (RPS)", + "title": "Live User Count", "type": "timeseries" }, { "datasource": { - "type": "loki", - "uid": "Loki" + "type": "prometheus", + "uid": "PBFA97CFB590B2093" }, "fieldConfig": { "defaults": { @@ -482,15 +634,28 @@ data: ] } }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Replicas (instant scale when 1 of the metrics >= 100% Saturation)" + }, + "properties": [ + { + "id": "custom.axisPlacement", + "value": "right" + } + ] + } + ] }, "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 9 + "h": 9, + "w": 14, + "x": 0, + "y": 17 }, - "id": 5, + "id": 8, "options": { "legend": { "calcs": [], @@ -507,13 +672,287 @@ data: "pluginVersion": "12.3.1", "targets": [ { - "expr": "sum by (status) (count_over_time({app=\"severed-blog\"} | regexp `HTTP/1.1\" (?P<status>[45]\\d{2})` [1m]))", - "legendFormat": "HTTP {{status}}", + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "(sum(rate(nginx_http_requests_total{namespace=\"severed-apps\"}[1m])) / count(nginx_http_requests_total{namespace=\"severed-apps\"})) / 500 * 100", + "legendFormat": "Traffic Saturation (100% Saturation = 500RPS)", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"severed-apps\", pod=~\"severed-blog.*\", container!=\"\"}[1m])) / 0.1 * 100", + "hide": false, + "instant": false, + "legendFormat": 
"CPU Saturation (100% Saturation = 90% Usage)", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(container_memory_working_set_bytes{namespace=\"severed-apps\", pod=~\"severed-blog.*\", container!=\"\"}) / (100 * 1024 * 1024) * 100", + "hide": false, + "instant": false, + "legendFormat": "Memory Saturation (100% Saturation = 80% Usage)", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "count(nginx_http_requests_total{job=\"integrations/nginx\"})", + "hide": false, + "instant": false, + "legendFormat": "Replicas (instant scale when 1 of the metrics >= 100% Saturation)", + "range": true, + "refId": "D" + } + ], + "title": "HPA Drivers: Saturation vs. Scaling", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "footer": { + "reducers": [] + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Status" + }, + "properties": [ + { + "id": "custom.width", + "value": 97 + } + ] + } + ] + }, + "gridPos": { + "h": 5, + "w": 10, + "x": 14, + "y": 17 + }, + "id": 10, + "options": { + "cellHeight": "sm", + "frameIndex": 0, + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "up" + } + ] + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "exemplar": false, + "expr": "up{job=\"integrations/nginx\"} ", + "instant": true, + "range": false, 
"refId": "A" } ], - "title": "HTTP Errors (4xx/5xx)", - "type": "timeseries" + "title": "Pods Status", + "transformations": [ + { + "id": "labelsToFields", + "options": { + "keepLabels": [ + "pod" + ] + } + }, + { + "id": "merge", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "includeByName": {}, + "indexByName": {}, + "renameByName": { + "pod": "Pod Name", + "up": "Status" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "loki", + "uid": "P8E80F9AEF21F6940" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "green", + "mode": "fixed" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/^4\\d{2}$/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/^5\\d{2}$/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/^3\\d{2}$/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 5, + "x": 0, + "y": 26 + }, + "id": 11, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "sort": "desc", + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P8E80F9AEF21F6940" + }, + "direction": "backward", + "editorMode": "code", + "expr": "sum by (status) (\n count_over_time(\n {namespace=\"severed-apps\"} |= \"HTTP/1.1\" | regexp 
`HTTP/1.1\" (?P<status>[12345]\\d{2})` [24h]\n )\n)", + "legendFormat": "{{status}}", + "queryType": "range", + "refId": "A" + } + ], + "title": "Traffic Quality (24h)", + "type": "piechart" } ], "preload": false, @@ -531,5 +970,5 @@ data: "timezone": "", "title": "Severed Cluster Health", "uid": "severed-health", - "version": 9 - } \ No newline at end of file + "version": 1 + } diff --git a/infra/observer/loki.yaml b/infra/observer/loki.yaml index 61f2c47..c3a6609 100644 --- a/infra/observer/loki.yaml +++ b/infra/observer/loki.yaml @@ -29,8 +29,6 @@ data: index: prefix: index_ period: 24h - limits_config: - allow_structured_metadata: true --- # --- Storage Service (Headless) --- @@ -67,8 +65,6 @@ spec: labels: app: loki spec: - # securityContext: - # fsGroup: 10001 # Often needed for Loki write permissions containers: - name: loki image: grafana/loki:latest @@ -86,7 +82,7 @@ spec: - name: config configMap: name: loki-config - # Persistent Storage: Automatically creates a Volume for data retention + # Persistent Storage volumeClaimTemplates: - metadata: name: data diff --git a/infra/observer/prometheus.yaml b/infra/observer/prometheus.yaml index abc3500..f5e1a56 100644 --- a/infra/observer/prometheus.yaml +++ b/infra/observer/prometheus.yaml @@ -13,6 +13,18 @@ data: tsdb: out_of_order_time_window: 1m + scrape_configs: + # 1. Scrape Prometheus itself (Health Check) + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + # 2. 
Scrape Kube State Metrics (KSM) + # We use the internal DNS: service-name.namespace.svc.cluster.local:port + - job_name: 'kube-state-metrics' + static_configs: + - targets: ['kube-state-metrics.monitoring.svc.cluster.local:8080'] + --- # Service apiVersion: v1 diff --git a/scripts/README.md b/scripts/README.md index df2531e..f9b3281 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -48,6 +48,8 @@ kubectl logs -n monitoring -l name=alloy --tail=50 [//]: # (kubectl rollout restart deployment severed-blog -n severed-apps) [//]: # (kubectl logs -n severed-apps -l app=severed-blog -f) +[//]: # (kubectl logs loki-0 -n monitoring --tail=20) + * **Internal Handshake:** Use your `access-hub.sh` script and visit `localhost:12345`. * Find the `prometheus.exporter.nginx.blog` component. * Ensure the health status is **Green/Up**. diff --git a/scripts/deploy-all.sh b/scripts/deploy-all.sh index 3251232..7f45f80 100644 --- a/scripts/deploy-all.sh +++ b/scripts/deploy-all.sh @@ -20,6 +20,12 @@ kubectl create secret generic grafana-secrets -n monitoring \ --from-literal=admin-password=admin \ --dry-run=client -o yaml | kubectl apply -f - +# Kube State Metrics (KSM) +echo "Installing kube-state-metrics..." +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update +helm upgrade --install kube-state-metrics prometheus-community/kube-state-metrics --namespace monitoring + kubectl apply -f infra/observer/loki.yaml kubectl apply -f infra/observer/prometheus.yaml kubectl apply -f infra/alloy-env.yaml