Added cAdvisor and kube-state-metrics (KSM), improved dashboard-json.yaml, updated deploy-all, added 7-day data retention to logs; certain metrics are now scraped directly from Prometheus instead of being derived from logs.

This commit is contained in:
wboughattas
2025-12-29 22:16:16 -05:00
parent ced3c41e7a
commit 181ab4f76e
9 changed files with 524 additions and 35 deletions

View File

@@ -1,8 +1,7 @@
# Severed Infra: Cloud-Native Home Lab
This repository contains the Infrastructure-as-Code (IaC) and manifest definitions for **Severed**, a modern blog and
observability stack running on Kubernetes (K3d). It demonstrates how to decouple configuration from code, automate
observability, and secure internal services—all running locally on your laptop.
observability stack running on Kubernetes (K3d).
## Architecture

View File

@@ -5,12 +5,19 @@ metadata:
namespace: severed-apps
data:
default.conf: |
# 1. Define the custom log format.
# "observability" emits client address, user, local time, the quoted request
# line, status, bytes sent, referer and user agent, and appends $request_time
# as a final quoted field.
# NOTE(review): the dashboard's Loki queries extract the status with a regexp
# of the form `HTTP/1.1" <status>`, so keep "$request" immediately before
# $status when changing this format.
log_format observability '$remote_addr - $remote_user [$time_local] "$request" '
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" "$request_time"';
server {
listen 80;
server_name localhost;
root /usr/share/nginx/html;
index index.html index.htm;
access_log /dev/stdout;
# 2. Apply the format to stdout
access_log /dev/stdout observability;
error_log /dev/stderr;
# gzip compression
@@ -50,10 +57,10 @@ data:
# metrics endpoint for Alloy/Prometheus
location /metrics {
stub_status on;
access_log off;
access_log off; # Keep noise out of our main logs
allow 127.0.0.1;
allow 10.0.0.0/8; # Allow internal cluster pods
allow 172.16.0.0/12; # Allow K3d/Docker internal bridge network
allow 10.0.0.0/8;
allow 172.16.0.0/12;
deny all;
}
}

View File

@@ -29,7 +29,7 @@ spec:
name: nginx_http_requests_total
target:
type: AverageValue
averageValue: "10k" # Scale up if requests > 10K per second per pod
averageValue: "500" # Scale up if requests > 500 per second per pod
behavior:
scaleDown:
stabilizationWindowSeconds: 300 # Wait 5 minutes before removing a pod

View File

@@ -11,12 +11,21 @@ kind: ClusterRole
metadata:
name: alloy-cluster-role
rules:
# 1. Standard API access: read-only (get/list/watch) on core-group nodes,
# nodes/proxy, services, endpoints and pods.
# Presumably consumed by Alloy's Kubernetes discovery components - confirm.
- apiGroups: [ "" ]
resources: [ "nodes", "nodes/proxy", "services", "endpoints", "pods" ]
verbs: [ "get", "list", "watch" ]
# 2. Node metrics access: "get" on the nodes/stats and nodes/metrics
# subresources. Presumably what authorizes the kubelet /metrics/cadvisor
# scrape configured in the Alloy config - verify against the kubelet
# authorization documentation.
- apiGroups: [ "" ]
resources: [ "nodes/stats", "nodes/metrics" ]
verbs: [ "get" ]
# 3. Log access: read-only (get/list/watch) on pods/log.
- apiGroups: [ "" ]
resources: [ "pods/log" ]
verbs: [ "get", "list", "watch" ]
# 4. Non-resource URLs: allows GET on the /metrics path only.
# NOTE(review): the original comment says this is only "sometimes" needed;
# confirm which scrape actually depends on it, otherwise consider dropping it.
- nonResourceURLs: ["/metrics"]
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
@@ -84,8 +93,6 @@ data:
targets = discovery.relabel.blog_pods.output
forward_to = [prometheus.remote_write.metrics_service.receiver]
job_name = "integrations/nginx"
// Removed the restrictive metric_relabel to ensure data flows
}
// 4. Host Metrics (Unix Exporter)
@@ -118,6 +125,26 @@ data:
url = sys.env("LOKI_URL")
}
}
// 7. Kubelet Scraper (cAdvisor for Container Metrics)
// Discovers cluster nodes (role = "node"); the exported targets are consumed
// by the prometheus.scrape "kubelet_cadvisor" component.
// No selectors are set here, so the target list is not filtered.
discovery.kubernetes "k8s_nodes" {
role = "node"
}
// Scrapes /metrics/cadvisor over HTTPS from every target produced by
// discovery.kubernetes.k8s_nodes and forwards the samples to the
// metrics_service remote_write receiver under the job name
// "integrations/kubernetes/cadvisor".
// NOTE(review): targets are not filtered per node; the agent section of this
// manifest is labelled a DaemonSet, so each instance would presumably scrape
// every node - confirm whether a node filter or clustering is intended.
prometheus.scrape "kubelet_cadvisor" {
targets = discovery.kubernetes.k8s_nodes.targets
scheme = "https"
metrics_path = "/metrics/cadvisor"
job_name = "integrations/kubernetes/cadvisor"
// NOTE(review): certificate verification is disabled for this scrape.
// Presumably acceptable for a local K3d lab; revisit before reusing elsewhere.
tls_config {
insecure_skip_verify = true
}
// Authenticate with the token file at the pod's service-account path.
bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
forward_to = [prometheus.remote_write.metrics_service.receiver]
}
---
# --- Agent Deployment (DaemonSet) ---
apiVersion: apps/v1
@@ -144,6 +171,7 @@ spec:
args:
- run
- --server.http.listen-addr=0.0.0.0:12345
- --storage.path=/var/lib/alloy/data
- /etc/alloy/config.alloy
envFrom:
- configMapRef:

View File

@@ -25,7 +25,7 @@ data:
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 0,
"id": 1,
"links": [],
"panels": [
{
@@ -128,7 +128,7 @@ data:
},
"editorMode": "code",
"expr": "node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes",
"legendFormat": "Used Memory",
"legendFormat": "{{instance}}",
"range": true,
"refId": "A"
}
@@ -336,6 +336,151 @@ data:
"type": "loki",
"uid": "Loki"
},
"fieldConfig": {
"defaults": {
"color": {
"fixedColor": "green",
"mode": "fixed",
"seriesBy": "last"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"fieldMinMax": false,
"mappings": [],
"noValue": "0",
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byRegexp",
"options": "/^4\\d{2}$/"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byRegexp",
"options": "/^5\\d{2}$/"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "purple",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byRegexp",
"options": "/^3\\d{2}$/"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "orange",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 9
},
"id": 5,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "12.3.1",
"targets": [
{
"direction": "backward",
"editorMode": "code",
"expr": "sum by (status) (\n count_over_time({namespace=\"severed-apps\"} \n | regexp `HTTP/1.1\" (?P<status>[12345]\\d{2})` [1m])\n)",
"legendFormat": "{{status}}",
"queryType": "range",
"refId": "A"
}
],
"title": "HTTP Requests",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
@@ -376,6 +521,7 @@ data:
}
},
"mappings": [],
"noValue": "0",
"thresholds": {
"mode": "absolute",
"steps": [
@@ -395,10 +541,10 @@ data:
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"x": 16,
"y": 9
},
"id": 4,
"id": 9,
"options": {
"legend": {
"calcs": [],
@@ -415,18 +561,24 @@ data:
"pluginVersion": "12.3.1",
"targets": [
{
"expr": "sum(rate({app=\"severed-blog\"}[1m]))",
"legendFormat": "Requests/sec",
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "sum(nginx_connections_active{namespace=\"severed-apps\"})",
"legendFormat": "Live Connections",
"range": true,
"refId": "A"
}
],
"title": "Web Traffic (RPS)",
"title": "Live User Count",
"type": "timeseries"
},
{
"datasource": {
"type": "loki",
"uid": "Loki"
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
@@ -482,15 +634,28 @@ data:
]
}
},
"overrides": []
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Replicas (instant scale when 1 of the metrics >= 100% Saturation)"
},
"properties": [
{
"id": "custom.axisPlacement",
"value": "right"
}
]
}
]
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 9
"h": 9,
"w": 14,
"x": 0,
"y": 17
},
"id": 5,
"id": 8,
"options": {
"legend": {
"calcs": [],
@@ -507,13 +672,287 @@ data:
"pluginVersion": "12.3.1",
"targets": [
{
"expr": "sum by (status) (count_over_time({app=\"severed-blog\"} | regexp `HTTP/1.1\" (?P<status>[45]\\d{2})` [1m]))",
"legendFormat": "HTTP {{status}}",
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "(sum(rate(nginx_http_requests_total{namespace=\"severed-apps\"}[1m])) / count(nginx_http_requests_total{namespace=\"severed-apps\"})) / 500 * 100",
"legendFormat": "Traffic Saturation (100% Saturation = 500RPS)",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"severed-apps\", pod=~\"severed-blog.*\", container!=\"\"}[1m])) / 0.1 * 100",
"hide": false,
"instant": false,
"legendFormat": "CPU Saturation (100% Saturation = 90% Usage)",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "sum(container_memory_working_set_bytes{namespace=\"severed-apps\", pod=~\"severed-blog.*\", container!=\"\"}) / (100 * 1024 * 1024) * 100",
"hide": false,
"instant": false,
"legendFormat": "Memory Saturation (100% Saturation = 80% Usage)",
"range": true,
"refId": "C"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "count(nginx_http_requests_total{job=\"integrations/nginx\"})",
"hide": false,
"instant": false,
"legendFormat": "Replicas (instant scale when 1 of the metrics >= 100% Saturation)",
"range": true,
"refId": "D"
}
],
"title": "HPA Drivers: Saturation vs. Scaling",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {
"align": "auto",
"cellOptions": {
"type": "auto"
},
"footer": {
"reducers": []
},
"inspect": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Status"
},
"properties": [
{
"id": "custom.width",
"value": 97
}
]
}
]
},
"gridPos": {
"h": 5,
"w": 10,
"x": 14,
"y": 17
},
"id": 10,
"options": {
"cellHeight": "sm",
"frameIndex": 0,
"showHeader": true,
"sortBy": [
{
"desc": false,
"displayName": "up"
}
]
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"exemplar": false,
"expr": "up{job=\"integrations/nginx\"} ",
"instant": true,
"range": false,
"refId": "A"
}
],
"title": "HTTP Errors (4xx/5xx)",
"type": "timeseries"
"title": "Pods Status",
"transformations": [
{
"id": "labelsToFields",
"options": {
"keepLabels": [
"pod"
]
}
},
{
"id": "merge",
"options": {}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true
},
"includeByName": {},
"indexByName": {},
"renameByName": {
"pod": "Pod Name",
"up": "Status"
}
}
}
],
"type": "table"
},
{
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"fieldConfig": {
"defaults": {
"color": {
"fixedColor": "green",
"mode": "fixed"
},
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
}
},
"mappings": []
},
"overrides": [
{
"matcher": {
"id": "byRegexp",
"options": "/^4\\d{2}$/"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byRegexp",
"options": "/^5\\d{2}$/"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "purple",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byRegexp",
"options": "/^3\\d{2}$/"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "orange",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 6,
"w": 5,
"x": 0,
"y": 26
},
"id": 11,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"pieType": "pie",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"sort": "desc",
"tooltip": {
"hideZeros": false,
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"direction": "backward",
"editorMode": "code",
"expr": "sum by (status) (\n count_over_time(\n {namespace=\"severed-apps\"} |= \"HTTP/1.1\" | regexp `HTTP/1.1\" (?P<status>[12345]\\d{2})` [24h]\n )\n)",
"legendFormat": "{{status}}",
"queryType": "range",
"refId": "A"
}
],
"title": "Traffic Quality (24h)",
"type": "piechart"
}
],
"preload": false,
@@ -531,5 +970,5 @@ data:
"timezone": "",
"title": "Severed Cluster Health",
"uid": "severed-health",
"version": 9
}
"version": 1
}

View File

@@ -29,8 +29,6 @@ data:
index:
prefix: index_
period: 24h
limits_config:
allow_structured_metadata: true
---
# --- Storage Service (Headless) ---
@@ -67,8 +65,6 @@ spec:
labels:
app: loki
spec:
# securityContext:
# fsGroup: 10001 # Often needed for Loki write permissions
containers:
- name: loki
image: grafana/loki:latest
@@ -86,7 +82,7 @@ spec:
- name: config
configMap:
name: loki-config
# Persistent Storage: Automatically creates a Volume for data retention
# Persistent Storage
volumeClaimTemplates:
- metadata:
name: data

View File

@@ -13,6 +13,18 @@ data:
tsdb:
out_of_order_time_window: 1m
scrape_configs:
# 1. Scrape Prometheus itself (Health Check)
# Static target: localhost:9090.
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
# 2. Scrape Kube State Metrics (KSM)
# We use the internal DNS: service-name.namespace.svc.cluster.local:port
# NOTE(review): this target assumes a Service named "kube-state-metrics" in
# the "monitoring" namespace listening on port 8080. The deploy script
# installs a Helm release with that name and namespace; presumably renaming
# the release changes this DNS name - confirm, and confirm the port.
- job_name: 'kube-state-metrics'
static_configs:
- targets: ['kube-state-metrics.monitoring.svc.cluster.local:8080']
---
# Service
apiVersion: v1

View File

@@ -48,6 +48,8 @@ kubectl logs -n monitoring -l name=alloy --tail=50
[//]: # (kubectl rollout restart deployment severed-blog -n severed-apps)
[//]: # (kubectl logs -n severed-apps -l app=severed-blog -f)
[//]: # (kubectl logs loki-0 -n monitoring --tail=20)
* **Internal Handshake:** Use your `access-hub.sh` script and visit `localhost:12345`.
* Find the `prometheus.exporter.nginx.blog` component.
* Ensure the health status is **Green/Up**.

View File

@@ -20,6 +20,12 @@ kubectl create secret generic grafana-secrets -n monitoring \
--from-literal=admin-password=admin \
--dry-run=client -o yaml | kubectl apply -f -
# Kube State Metrics (KSM)
# Registers the prometheus-community chart repository, refreshes the local
# chart index, then installs or upgrades the "kube-state-metrics" release in
# the monitoring namespace.
# NOTE(review): the Prometheus scrape config targets
# kube-state-metrics.monitoring.svc.cluster.local, so the release name and
# namespace here presumably need to stay in sync with it - confirm.
echo "Installing kube-state-metrics..."
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
helm upgrade --install kube-state-metrics prometheus-community/kube-state-metrics --namespace monitoring
kubectl apply -f infra/observer/loki.yaml
kubectl apply -f infra/observer/prometheus.yaml
kubectl apply -f infra/alloy-env.yaml