{"count":8,"playbooks":[{"id":"disk_cleanup","name":"Disk Full Cleanup","description":"Monitors local disk usage and automatically clears old logs, temp files, and Docker artifacts when disk exceeds threshold.","trigger":"Local disk usage > 92%","incident_type":"disk","actions":["rotate_logs","clear_tmp","docker_prune","verify"],"schedule":"every 15 minutes"},{"id":"memory_pressure","name":"Memory Pressure Auto-Restart","description":"Monitors system memory pressure and restarts runaway processes when memory exceeds threshold. Captures diagnostic snapshot and verifies recovery.","trigger":"System memory pressure > 85%","incident_type":"memory","actions":["capture_diagnostics","graceful_restart","verify_recovery"],"schedule":"every 15 minutes"},{"id":"health_check","name":"Failing Health Check Auto-Restart","description":"Polls the service /health endpoint every 30 seconds. After 3 consecutive failures, captures a diagnostic snapshot, issues a graceful service restart, and verifies recovery with a post-restart health probe.","trigger":"3 consecutive /health failures (503 or timeout)","incident_type":"health","actions":["capture_diagnostics","graceful_restart","verify_recovery"],"schedule":"every 30 seconds"},{"id":"ssl_expiry","name":"SSL Cert Expiry Auto-Renewal","description":"Scans configured domains daily. Flags certs expiring in <14 days (warn) or <3 days (remediate). Renews via Let's Encrypt / certbot, reloads the web server gracefully, and verifies the new cert is live.","trigger":"SSL cert expires in <3 days","incident_type":"ssl","actions":["check_certbot_config","trigger_renewal","reload_web_server","verify_cert_serving"],"schedule":"every 24 hours"},{"id":"dbpool_exhaustion","name":"Postgres Connection Pool Auto-Recovery","description":"Monitors active Postgres connections via pg_stat_activity. At >80% warns; at >92% runs 4-step recovery: snapshots connections, terminates idle-in-transaction backends older than 5 minutes via pg_terminate_backend, verifies pool drops below 70%, and emits a sizing recommendation if utilization remains elevated.","trigger":"Postgres pool utilization > 92%","incident_type":"dbpool","actions":["capture_pool_snapshot","terminate_idle_backends","verify_pool_recovery","emit_recommendation"],"schedule":"every 5 minutes"},{"id":"ratelimit_429","name":"API Rate-Limit Auto-Recovery","description":"Monitors outbound HTTP request metrics for sustained 429 responses from upstream APIs. When >5% of requests to a single upstream return 429 over a 60s rolling window, applies 50% concurrency reduction with exponential backoff + full jitter. If 429 rate stays >2% after 60s, trips a circuit breaker (fail-fast 30s, then half-open probe). Verifies sub-1% 429 rate sustained 30s, then emits a config diff recommendation.","trigger":">5% of requests to an upstream returning 429 over 60s window","incident_type":"ratelimit","actions":["snapshot_upstream_metrics","apply_backoff","circuit_breaker_eval","verify_recovery","emit_recommendation"],"schedule":"every 5 minutes"},{"id":"k8s_crashloop","name":"Kubernetes CrashLoopBackOff Auto-Recovery","description":"Polls kubectl get pods --all-namespaces for CrashLoopBackOff state. On >=3 restarts within 5 minutes, captures last 100 log lines, describes the pod, and classifies into OOMKilled (137), ImagePullBackOff, Missing ConfigMap/Secret, Liveness probe failure, or App crash. Remediates per class: OOMKilled patches deployment memory +50% (cap 4Gi) and does rollout restart; ImagePullBackOff verifies tag and restarts; Missing ConfigMap/Secret re-applies from git; Liveness probe logs config and restarts pod; App crash escalates to operator. Verifies Ready=1/1 and stable 90s.","trigger":"Pod in CrashLoopBackOff with >=3 restarts in 5-minute window","incident_type":"k8s_crashloop","actions":["fetch_pod_logs","describe_pod","patch_deployment_resources","rollout_restart","verify_pod_ready"],"schedule":"every 5 minutes"},{"id":"ssl_cert_renewal","name":"SSL/TLS Certificate Auto-Renewal (Let's Encrypt + cert-manager)","description":"Monitors cert expiry for configured domains. Within 7 days of expiry, classifies the renewal blocker: ACME rate limit, DNS challenge failure, ACME endpoint down, or manually-managed cert. Remediation tree: retry HTTP-01 ACME challenge → fallback to DNS-01 challenge → escalate to operator if cert is manually managed. Reloads nginx/caddy gracefully on cert swap. Verifies new cert is serving with correct chain. References: Fly.io Oct 2024 outage, GitHub cert expiry 2020, Microsoft Teams 2020.","trigger":"SSL/TLS cert expires in ≤7 days OR last ACME renewal attempt failed","incident_type":"ssl_cert_expiring","actions":["detect_cert_expiry","acme_http01_retry","acme_dns01_fallback","reload_web_server","verify_cert_serving"],"schedule":"every 12 hours"}],"webhooks":{"supported_channels":["slack","discord","generic","pagerduty"],"endpoint":"/api/webhooks","test_endpoint":"GET /webhooks/test?url=...&channel=...","description":"Every playbook run fires a structured outbound webhook to Slack, Discord, or any generic endpoint. PagerDuty Events API v2 shape supported."},"_note":"Cloud playbooks (AWS/GCP/Azure) coming soon."}