fixed nginx_http_requests_total, added sh scripts
@@ -51,6 +51,7 @@ data:
    access_log off;
    allow 127.0.0.1;
    allow 10.0.0.0/8;     # Allow internal cluster pods
    allow 172.16.0.0/12;  # Allow K3d/Docker internal bridge network
    deny all;
  }
}
@@ -26,7 +26,7 @@ spec:
  - type: Pods
    pods:
      metric:
        name: http_requests_per_second
        name: nginx_http_requests_total
      target:
        type: AverageValue
        averageValue: 10 # Scale up if requests > 10 per second per pod
@@ -21,15 +21,33 @@ spec:
        - containerPort: 80
      resources:
        requests:
          cpu: "50m" # 0.05 cores (Guaranteed to the pod)
          memory: "64Mi" # 64 Megabytes
          cpu: "50m"
          memory: "64Mi"
        limits:
          cpu: "200m" # Max allowed (Prevents one pod from eating the Mac's CPU)
          memory: "128Mi" # Max allowed
          cpu: "200m"
          memory: "128Mi"
      volumeMounts:
        - name: nginx-config-vol
          mountPath: /etc/nginx/conf.d/default.conf
          subPath: default.conf

    # --- ADD THE EXPORTER SIDECAR HERE ---
    - name: exporter
      image: nginx/nginx-prometheus-exporter:latest
      args:
        - -nginx.scrape-uri=http://localhost:80/metrics
      ports:
        - containerPort: 9113
          name: metrics
      resources:
        requests:
          cpu: "10m"
          memory: "32Mi"
        limits:
          cpu: "50m"
          memory: "64Mi"
    # -------------------------------------

  volumes:
    - name: nginx-config-vol
      configMap:
@@ -1,6 +1,4 @@
# --- RBAC configuration ---
# creates a serviceaccount with permissions to discover pods and read logs.

apiVersion: v1
kind: ServiceAccount
metadata:
@@ -13,11 +11,9 @@ kind: ClusterRole
metadata:
  name: alloy-cluster-role
rules:
  # discovery permissions: allows alloy to find targets: Nodes, Pods, Services.
  - apiGroups: [ "" ]
    resources: [ "nodes", "nodes/proxy", "services", "endpoints", "pods" ]
    verbs: [ "get", "list", "watch" ]
  # log access: required for 'loki.source.kubernetes' to tail logs.
  - apiGroups: [ "" ]
    resources: [ "pods/log" ]
    verbs: [ "get", "list", "watch" ]
@@ -38,8 +34,6 @@ subjects:

---
# --- Alloy pipeline configuration ---
# defines how telemetry data is collected, processed, and exported.

apiVersion: v1
kind: ConfigMap
metadata:
@@ -47,51 +41,39 @@ metadata:
  namespace: monitoring
data:
  config.alloy: |
    // 1. discovery
    // 1. Discovery: Find all pods
    discovery.kubernetes "k8s_pods" {
      role = "pod"
    }

    // 2. Relabeling (MUST BE DEFINED BEFORE USE)
    // This adds 'app', 'namespace', and 'pod' labels to the targets
    discovery.relabel "k8s_labels" {
    // 2. Relabeling: Filter for ONLY the blog pods
    discovery.relabel "blog_pods" {
      targets = discovery.kubernetes.k8s_pods.targets

      rule {
        action = "replace"
        source_labels = ["__meta_kubernetes_pod_label_app"]
        target_label = "app"
      }
      rule {
        action = "replace"
        source_labels = ["__meta_kubernetes_namespace"]
        target_label = "namespace"
      }
      rule {
        action = "replace"
        source_labels = ["__meta_kubernetes_pod_name"]
        target_label = "pod"
      }
    }

    // 3. Filter for Blog Metrics
    // Only pass targets where the label 'app' is 'severed-blog'
    discovery.relabel "blog_filter" {
      targets = discovery.relabel.k8s_labels.output
      rule {
        source_labels = ["app"]
        regex = "severed-blog"
        action = "keep"
        source_labels = ["__meta_kubernetes_pod_label_app"]
        regex = "severed-blog"
      }

      rule {
        action = "replace"
        source_labels = ["__address__"]
        target_label = "__address__"
        regex = "([^:]+)(?::\\d+)?"
        replacement = "$1:9113"
      }
    }

    // 4. Metrics Pipeline
    // 3. Direct Nginx Scraper
    prometheus.scrape "nginx_scraper" {
      targets = discovery.relabel.blog_filter.output
      targets = discovery.relabel.blog_pods.output

      forward_to = [prometheus.remote_write.metrics_service.receiver]
      job_name = "integrations/nginx"
    }

    // 4. Host Metrics
    prometheus.exporter.unix "host" {
      rootfs_path = "/host/root"
      sysfs_path = "/host/sys"
@@ -103,15 +85,16 @@ data:
      forward_to = [prometheus.remote_write.metrics_service.receiver]
    }

    // 5. Remote Write: Send to Prometheus
    prometheus.remote_write "metrics_service" {
      endpoint {
        url = sys.env("PROM_URL")
      }
    }

    // 5. Logs Pipeline
    // 6. Logs Pipeline: Send to Loki
    loki.source.kubernetes "pod_logs" {
      targets = discovery.relabel.k8s_labels.output
      targets = discovery.relabel.blog_pods.output
      forward_to = [loki.write.default.receiver]
    }
@@ -120,11 +103,8 @@ data:
        url = sys.env("LOKI_URL")
      }
    }

---
# --- Agent Deployment (DaemonSet) ---
# deploys one alloy agent per node to monitor the entire cluster.

apiVersion: apps/v1
kind: DaemonSet
metadata:
@@ -142,10 +122,7 @@ spec:
      serviceAccountName: alloy-sa
      hostNetwork: true
      hostPID: true

      # Forces the pod to use K8s CoreDNS even when running on host network
      dnsPolicy: ClusterFirstWithHostNet

      containers:
        - name: alloy
          image: grafana/alloy:latest
@@ -153,12 +130,10 @@ spec:
            - run
            - --server.http.listen-addr=0.0.0.0:12345
            - /etc/alloy/config.alloy

          envFrom:
            - configMapRef:
                name: monitoring-env
                optional: false

          volumeMounts:
            - name: config
              mountPath: /etc/alloy
@@ -14,3 +14,9 @@ apiVersion: v1
kind: Namespace
metadata:
  name: kubernetes-dashboard

---
apiVersion: v1
kind: Namespace
metadata:
  name: openebs
scripts/README.md (new file, 102 lines)
@@ -0,0 +1,102 @@
# Severed-Infra: Health & Diagnostics Guide

### 1. The Foundation: Node & Storage Stability

Before troubleshooting apps, ensure the physical (Docker) layer is stable.

* **Node Readiness:** All 3 nodes (1 server, 2 agents) must be `Ready`.

```bash
kubectl get nodes
```
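
Resource pressure on the nodes is the usual precursor to the `502 Bad Gateway` symptom in the cheat sheet below. A quick way to spot it is a minimal sketch like this, assuming the metrics-server that k3s bundles by default is running:

```bash
# Per-node CPU/memory usage; spikes here usually precede 502s and evictions
kubectl top nodes

# Per-pod usage inside the app namespace
kubectl top pods -n severed-apps
```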

* **Storage Binding:** Verify that the OpenEBS Persistent Volume Claims (PVCs) for Loki and Prometheus are `Bound`.

```bash
kubectl get pvc -n monitoring
```

[//]: # (todo add: kubectl get pods -n openebs)

* **Pod Health:** Confirm that every workload namespace is running cleanly.

```bash
kubectl get pods -n severed-apps
kubectl get pods -n monitoring
kubectl get pods -n kubernetes-dashboard
kubectl get pods -n openebs
```

* **Grafana Recovery:** If Grafana is stuck or misconfigured, restart its deployment.

```bash
kubectl rollout restart deployment grafana -n monitoring
```

---

### 2. The Telemetry Bridge: Alloy & Exporter

Check that the exporter sidecar is translating the raw Nginx status page into Prometheus metrics, and that Alloy is scraping and forwarding them.

* **Error Scan:** Check the Alloy logs specifically for `scrape_uri` or `connection refused` errors.

```bash
kubectl logs -n monitoring -l name=alloy --tail=50
```

[//]: # (kubectl apply -f infra/alloy-setup.yaml)
[//]: # (kubectl delete pods -n monitoring -l name=alloy)
[//]: # (kubectl get pods -n monitoring)
[//]: # (kubectl describe pod alloy-dq2cd -n monitoring)
[//]: # (kubectl logs -n monitoring -l name=alloy --tail=50)
[//]: # (kubectl get pod -n monitoring -l app=grafana -o jsonpath='{.items[0].spec.containers[0].env}' | jq)

[//]: # (kubectl rollout restart deployment severed-blog -n severed-apps)

* **Internal Handshake:** Run your `access-hub.sh` script and visit `localhost:12345` (a CLI alternative is sketched below).
  * Find the `prometheus.scrape.nginx_scraper` component.
  * Ensure the health status is **Green/Up**.
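
If the Alloy UI is unreachable, you can talk to the exporter sidecar directly. This is only a sketch: it assumes the blog deployment is named `severed-blog` in the `severed-apps` namespace (matching the rollout command above) and that `curl` is available on your machine.

```bash
# Tunnel the exporter sidecar's metrics port (9113) from the blog deployment
kubectl port-forward -n severed-apps deploy/severed-blog 9113:9113 &
PF_PID=$!
sleep 2

# A healthy exporter returns nginx_* series, including nginx_http_requests_total
curl -s http://localhost:9113/metrics | grep '^nginx_'

# Clean up the tunnel
kill "$PF_PID"
```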

---

### 3. The Database: Prometheus Query Test

If the exporter is working, the metrics will appear in the Prometheus time-series database.

* **Live Traffic Check:** Verify that `nginx_http_requests_total` is returning a data vector (not an empty list `[]`).

```bash
kubectl exec -it prometheus-0 -n monitoring -- \
  wget -qO- "http://localhost:9090/api/v1/query?query=nginx_http_requests_total"
```

* **Metric Discovery:** List all Nginx-related metrics currently being stored.

```bash
kubectl exec -it prometheus-0 -n monitoring -- \
  wget -qO- "http://localhost:9090/api/v1/label/__name__/values" | grep nginx
```
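
`nginx_http_requests_total` is a cumulative counter, so the number the HPA ultimately consumes is typically a per-second rate derived from it. You can preview that rate with a `rate()` query; this is a sketch where the 2-minute window is an arbitrary choice, the real window comes from `adapter-values.yaml`:

```bash
# %5B / %5D are the URL-encoded [ ] of the PromQL range selector rate(...[2m])
kubectl exec -it prometheus-0 -n monitoring -- \
  wget -qO- "http://localhost:9090/api/v1/query?query=rate(nginx_http_requests_total%5B2m%5D)"
```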

---

### 4. The "Brain": Horizontal Pod Autoscaler (HPA)

The HPA is the final consumer of this data. If it is healthy, the cluster is auto-scaling correctly.

* **Target Alignment:** The `TARGETS` column should show a real value (e.g., `0/10`) rather than `<unknown>`.

```bash
kubectl get hpa -n severed-apps
```

* **Adapter Check:** Ensure the Custom Metrics API is serving the translated Nginx metrics to the Kubernetes control plane.

```bash
kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1/namespaces/severed-apps/pods/*/nginx_http_requests_total"
```
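
To watch the whole chain react end to end, generate a short burst of traffic and keep an eye on the HPA. This is a sketch: it assumes the blog is reachable at `http://blog.localhost:8080`, the URL printed by `access-hub.sh`, and that `curl` is installed locally.

```bash
# ~300 requests over ~30 seconds, then watch the TARGETS column climb
for i in $(seq 1 300); do
  curl -s -o /dev/null http://blog.localhost:8080
  sleep 0.1
done

kubectl get hpa -n severed-apps -w
```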

---

### Cheat Sheet

| Symptom                     | Probable Cause              | Fix                                                             |
|-----------------------------|-----------------------------|-----------------------------------------------------------------|
| `502 Bad Gateway`           | Node resource exhaustion    | Restart K3d or increase Docker RAM                              |
| `strconv.ParseFloat` error  | Missing Nginx Exporter      | Use `prometheus.exporter.nginx` in Alloy                        |
| HPA shows `<unknown>`       | Prometheus Adapter mismatch | Verify `adapter-values.yaml` metric names (see the check below) |
| `No nodes found`            | Corrupted cluster state     | Run `k3d cluster delete` and recreate                           |
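
For the adapter-mismatch row, the quickest check is to compare the metric name the HPA asks for with what the adapter actually exposes. A sketch, assuming the adapter was installed via the Helm release in `deploy-all.sh`:

```bash
# Metric name requested by the HPA spec
kubectl get hpa -n severed-apps -o jsonpath='{.items[*].spec.metrics[*].pods.metric.name}'; echo

# Metric names the Prometheus Adapter is currently serving
kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" | grep -o '"pods/[^"]*"'
```
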
scripts/access-hub.sh (new file, 24 lines)
@@ -0,0 +1,24 @@
#!/bin/bash

set -e

# Kill existing tunnels to prevent port conflicts
# (|| true keeps set -e from aborting when no kubectl processes are running)
killall kubectl 2>/dev/null || true

# Kubernetes Dashboard (Kong Proxy)
kubectl -n kubernetes-dashboard port-forward svc/kubernetes-dashboard-kong-proxy 8443:443 &

# Alloy UI (Internal Health)
kubectl -n monitoring port-forward ds/alloy 12345:12345 &

# Grafana
kubectl -n monitoring port-forward svc/grafana-service 3000:3000 &

echo "Dashboard: https://localhost:8443"
echo "Alloy UI:  http://localhost:12345"
echo "Grafana:   http://localhost:3000"

# Ingress routes (assumes the k3d load balancer maps port 8080; no tunnel needed)
echo "Grafana: http://grafana.localhost:8080"
echo "Blog: http://blog.localhost:8080"

wait
scripts/deploy-all.sh (new file, 50 lines)
@@ -0,0 +1,50 @@
#!/bin/bash
set -e

# Run from the repository root, regardless of where the script is invoked from
cd "$(dirname "$0")/.."

# 0. Environment Prep
echo "Importing severed-blog:v0.3 into k3d..."
k3d image import severed-blog:v0.3 -c severed-cluster

# 1. Foundation
kubectl apply -f namespaces.yaml

echo "Installing OpenEBS LocalPV Provisioner..."
kubectl apply -f https://openebs.github.io/charts/openebs-operator.yaml
kubectl apply -f infra/storage/openebs-sc.yaml

# 2. Monitoring Stack
echo "Creating Grafana Secrets..."
kubectl create secret generic grafana-secrets -n monitoring \
  --from-literal=admin-user=admin \
  --from-literal=admin-password=admin \
  --dry-run=client -o yaml | kubectl apply -f -

kubectl apply -f infra/observer/loki.yaml
kubectl apply -f infra/observer/prometheus.yaml
kubectl apply -f infra/alloy-env.yaml
kubectl apply -f infra/alloy-setup.yaml

# 3. Application Layer
kubectl apply -f apps/severed-blog-config.yaml
kubectl apply -f apps/severed-blog.yaml
kubectl apply -f apps/severed-blog-service.yaml
kubectl apply -f apps/severed-blog-hpa.yaml
kubectl apply -f apps/severed-ingress.yaml

# 4. Visualization and Scaling Bridge
kubectl apply -f infra/observer/dashboard-json.yaml
kubectl apply -f infra/observer/grafana.yaml

echo "Installing Prometheus Adapter..."
# Make sure the prometheus-community chart repo is available before installing
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts 2>/dev/null || true
helm repo update
helm upgrade --install prometheus-adapter prometheus-community/prometheus-adapter \
  -n monitoring \
  -f infra/observer/adapter-values.yaml

# 5. Dashboard Setup
kubectl apply -f infra/dashboard/dashboard-admin.yaml
kubectl apply -f infra/dashboard/permanent-token.yaml

echo "Deployment Complete. Retrieving Token..."
kubectl -n kubernetes-dashboard get secret admin-user-token -o jsonpath='{.data.token}' | base64 -d
echo -e "\n"
scripts/setup-grafana-creds.sh (new file, 15 lines)
@@ -0,0 +1,15 @@
#!/bin/bash

set -e

# Prompt for the new password without echoing it to the terminal
read -r -s -p "Enter new Grafana admin password: " NEW_PASS
echo

# || true keeps set -e from aborting when no Grafana pod exists yet
GRAFANA_POD=$(kubectl get pod -n monitoring -l app=grafana -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)

if [ -z "$GRAFANA_POD" ]; then
  echo "Error: Grafana pod not found."
  exit 1
fi

echo "Setting Grafana admin password..."
kubectl exec -it -n monitoring "$GRAFANA_POD" -- grafana-cli admin reset-admin-password "$NEW_PASS"