Self-Host Prometheus + Grafana
By the OSSAlt Team
Tags: prometheus, grafana, monitoring, alerting, self-hosting, docker, 2026
TL;DR
Prometheus (Apache 2.0, ~55K GitHub stars, Go) scrapes and stores time-series metrics. Grafana (AGPL 3.0, ~63K stars, TypeScript) visualizes them in dashboards. Together they're the industry-standard open source observability stack — used at Netflix, Cloudflare, and thousands of companies. This guide covers the full stack: Prometheus + Grafana + Alertmanager + node_exporter + cAdvisor for complete server and container monitoring.
Key Takeaways
- Prometheus: Apache 2.0, ~55K stars — pull-based metric scraper, PromQL query language
- Grafana: AGPL 3.0, ~63K stars — dashboard visualization, 50+ data sources
- Alertmanager: Routes alerts to Slack, PagerDuty, email based on labels
- node_exporter: Exposes Linux host metrics (CPU, memory, disk, network) for Prometheus
- cAdvisor: Exposes Docker container metrics for Prometheus
- vs Netdata: More customizable, more setup; Netdata is turnkey with less flexibility
Part 1: Full Stack Docker Compose
# docker-compose.yml
services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    restart: unless-stopped
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./prometheus/rules:/etc/prometheus/rules:ro
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=90d'  # Keep 90 days of data
      # NOTE: the old --web.console.libraries/--web.console.templates flags
      # were removed in Prometheus 3.x; passing them to :latest makes the
      # container crash-loop. They had sane defaults in 2.x, so dropping
      # them is safe on either major version.
      - '--web.enable-lifecycle'  # enables `curl -X POST .../-/reload`

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    restart: unless-stopped
    ports:
      - "3000:3000"
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
    environment:
      GF_SECURITY_ADMIN_USER: admin
      GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_PASSWORD}"
      GF_USERS_ALLOW_SIGN_UP: "false"
      GF_SERVER_ROOT_URL: "https://grafana.yourdomain.com"
      GF_SERVER_DOMAIN: "grafana.yourdomain.com"
      GF_SMTP_ENABLED: "true"
      GF_SMTP_HOST: "smtp.yourdomain.com:587"
      GF_SMTP_USER: "${SMTP_USER}"
      GF_SMTP_PASSWORD: "${SMTP_PASS}"
      GF_SMTP_FROM_ADDRESS: "grafana@yourdomain.com"

  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    restart: unless-stopped
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
      - alertmanager_data:/alertmanager

  node-exporter:
    image: prom/node-exporter:latest
    container_name: node_exporter
    restart: unless-stopped
    pid: host
    # NOTE(review): the original used `network_mode: host`, which removes the
    # container from the compose network — Prometheus's `node-exporter:9100`
    # target could then never resolve. On the default bridge network the
    # scrape works via the service name; the trade-off is that network
    # interface metrics reflect the container's netns. If you need host NIC
    # metrics, restore host networking and scrape the host's IP instead.
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      # $$ escapes $ for compose so node_exporter receives a literal `$`:
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: cadvisor
    restart: unless-stopped
    privileged: true
    devices:
      - /dev/kmsg:/dev/kmsg
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
      # NOTE(review): the original also mounted /cgroup:/cgroup:ro — that path
      # does not exist on modern hosts (cgroups live under /sys/fs/cgroup,
      # already covered by the /sys mount), and binding it just makes Docker
      # create an empty /cgroup directory on the host. Dropped.

volumes:
  prometheus_data:
  grafana_data:
  alertmanager_data:
Part 2: Prometheus Config
# prometheus/prometheus.yml
global:
  scrape_interval: 15s      # how often targets are scraped
  evaluation_interval: 15s  # how often alert/recording rules are evaluated

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

rule_files:
  - "rules/*.yml"

scrape_configs:
  # Prometheus self-monitoring:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Host metrics (node_exporter on the compose network):
  - job_name: 'node'
    static_configs:
      - targets: ['node-exporter:9100']
        labels:
          instance: 'server-1'

  # Docker container metrics:
  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']

  # Additional servers (add your other hosts here):
  - job_name: 'remote-nodes'
    static_configs:
      - targets:
          - '192.168.1.10:9100'  # server-2 with node_exporter
          - '192.168.1.11:9100'  # server-3 with node_exporter
        labels:
          env: production

  # PostgreSQL (if running postgres_exporter):
  - job_name: 'postgresql'
    static_configs:
      - targets: ['postgres-exporter:9187']

  # Redis (if running redis_exporter):
  - job_name: 'redis'
    static_configs:
      - targets: ['redis-exporter:9121']
Part 3: HTTPS with Caddy
grafana.yourdomain.com {
    reverse_proxy localhost:3000
}

prometheus.yourdomain.com {
    # Restrict Prometheus to internal access only. All three RFC 1918
    # private ranges are allowed — the original omitted 172.16.0.0/12,
    # which is what Docker bridge networks and many VPNs hand out, so
    # those internal clients were getting 403'd.
    @external not remote_ip 192.168.0.0/16 10.0.0.0/8 172.16.0.0/12
    respond @external 403
    reverse_proxy localhost:9090
}
Part 4: Alert Rules
# prometheus/rules/host.yml
groups:
  - name: host_alerts
    rules:
      # CPU: 100 - idle% averaged per instance over 5m, sustained 10m.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is {{ $value | humanize }}%"

      # Disk: exclude tmpfs so ephemeral RAM-backed mounts don't false-alarm
      # (the original rule had no fstype filter; this matches the PromQL
      # reference used elsewhere in this guide).
      - alert: DiskSpaceLow
        expr: (1 - node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space low on {{ $labels.instance }}"
          description: "{{ $labels.mountpoint }} is {{ $value | humanize }}% full"

      - alert: HighMemoryUsage
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"

      # Fires for any scrape target Prometheus cannot reach.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}"

  - name: container_alerts
    rules:
      - alert: ContainerHighCPU
        expr: sum(rate(container_cpu_usage_seconds_total{name!=""}[5m])) by (name) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high CPU usage"
Part 5: Alertmanager Config
# alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m
  # NOTE(review): Alertmanager does NOT expand environment variables in its
  # config file — "${SLACK_WEBHOOK_URL}" below would be sent literally.
  # Either render this file before start (e.g. `envsubst`) or, better, use
  # `slack_api_url_file` pointing at a mounted secret file.
  slack_api_url: "${SLACK_WEBHOOK_URL}"

route:
  group_by: ['alertname', 'instance']
  group_wait: 30s        # wait to batch alerts from the same group
  group_interval: 5m     # min time between notifications for a group
  repeat_interval: 4h    # re-notify for still-firing alerts
  receiver: 'slack-default'
  routes:
    # `matchers` replaces the deprecated `match` syntax (Alertmanager >= 0.22):
    - matchers:
        - 'severity = "critical"'
      receiver: 'slack-critical'
      repeat_interval: 1h

receivers:
  - name: 'slack-default'
    slack_configs:
      - channel: '#alerts'
        title: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

  - name: 'slack-critical'
    slack_configs:
      - channel: '#alerts-critical'
        title: '🔴 CRITICAL: {{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
    pagerduty_configs:
      # Same caveat as slack_api_url: not env-expanded; use a secret file
      # via `routing_key_file` or render with envsubst.
      - routing_key: "${PAGERDUTY_KEY}"

# Suppress warning-level alerts when a critical alert with the same
# alertname+instance is already firing:
inhibit_rules:
  - source_matchers:
      - 'severity = "critical"'
    target_matchers:
      - 'severity = "warning"'
    equal: ['alertname', 'instance']
Part 6: Grafana Dashboards
Import community dashboards
- Grafana → + → Import
- Enter dashboard ID from grafana.com/dashboards:
- 1860 — Node Exporter Full (host metrics)
- 893 — Docker and system monitoring
- 9628 — PostgreSQL Database
- 11835 — Redis Exporter Dashboard
- 7362 — Cadvisor Docker metrics
Auto-provision dashboards
# grafana/provisioning/datasources/prometheus.yml
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy               # Grafana backend proxies queries to Prometheus
    url: http://prometheus:9090 # compose service name, not localhost
    isDefault: true
# grafana/provisioning/dashboards/default.yml
apiVersion: 1
providers:
  - name: 'default'
    folder: ''   # empty string = General folder
    type: file
    options:
      path: /etc/grafana/provisioning/dashboards
# Place dashboard JSON files in:
grafana/provisioning/dashboards/
node-exporter.json ← Auto-loaded on startup
docker.json
Part 7: PromQL Quick Reference
# CPU usage by instance (last 5min average):
100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
# Memory usage %:
(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
# Disk usage by mount:
(1 - node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes) * 100
# HTTP request rate (if you expose an app with /metrics):
rate(http_requests_total[5m])
# 95th percentile request duration:
# NOTE(review): when scraping several instances, aggregate buckets first —
# histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
# Container memory usage:
container_memory_usage_bytes{name!=""}
# Container CPU rate:
sum(rate(container_cpu_usage_seconds_total{name!=""}[5m])) by (name)
Maintenance
# Update stack:
docker compose pull
docker compose up -d
# Reload Prometheus config (no restart):
# (works because the compose file passes --web.enable-lifecycle)
curl -X POST http://localhost:9090/-/reload
# Backup Prometheus data:
# NOTE(review): the volume name is prefixed with the compose project name —
# `prometheus_` here assumes the project directory is named "prometheus";
# confirm yours with `docker volume ls`. Also, tarring a live TSDB can catch
# a mid-compaction state — stop the container first, or use the snapshot API
# (requires --web.enable-admin-api).
tar -czf prometheus-backup-$(date +%Y%m%d).tar.gz \
$(docker volume inspect prometheus_prometheus_data --format '{{.Mountpoint}}')
# Backup Grafana (dashboards + settings):
tar -czf grafana-backup-$(date +%Y%m%d).tar.gz \
$(docker volume inspect prometheus_grafana_data --format '{{.Mountpoint}}')
# Check active alerts:
curl http://localhost:9090/api/v1/alerts | jq '.data.alerts[]'
See all open source monitoring tools at OSSAlt.com/categories/devops.