Skip to main content

Self-Host Prometheus + Grafana

·OSSAlt Team
Tags: prometheus, grafana, monitoring, alerting, self-hosting, docker, 2026

TL;DR

Prometheus (Apache 2.0, ~55K GitHub stars, Go) scrapes and stores time-series metrics. Grafana (AGPL 3.0, ~63K stars, TypeScript) visualizes them in dashboards. Together they're the industry-standard open source observability stack — used at Netflix, Cloudflare, and thousands of companies. This guide covers the full stack: Prometheus + Grafana + Alertmanager + node_exporter + cAdvisor for complete server and container monitoring.

Key Takeaways

  • Prometheus: Apache 2.0, ~55K stars — pull-based metric scraper, PromQL query language
  • Grafana: AGPL 3.0, ~63K stars — dashboard visualization, 50+ data sources
  • Alertmanager: Routes alerts to Slack, PagerDuty, email based on labels
  • node_exporter: Exposes Linux host metrics (CPU, memory, disk, network) for Prometheus
  • cAdvisor: Exposes Docker container metrics for Prometheus
  • vs Netdata: More customizable, more setup; Netdata is turnkey with less flexibility

Part 1: Full Stack Docker Compose

# docker-compose.yml
services:
  # Prometheus server: scrapes targets, stores the time-series database and
  # evaluates the alert rules mounted under /etc/prometheus/rules.
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    restart: unless-stopped
    ports:
      # Loopback only: the UI/API has no authentication and
      # --web.enable-lifecycle (below) exposes reload/quit endpoints.
      # The reverse proxy on the host still reaches it via localhost:9090.
      - "127.0.0.1:9090:9090"
    extra_hosts:
      # node-exporter runs with network_mode: host, so it has no DNS name on
      # the Compose network. Map the name to the Docker host so the
      # 'node-exporter:9100' scrape target resolves (Docker Engine 20.10+).
      # If a host firewall is active, allow the Docker bridge to reach 9100.
      - "node-exporter:host-gateway"
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./prometheus/rules:/etc/prometheus/rules:ro
      - prometheus_data:/prometheus
    # Setting `command` replaces the image's default arguments, so the config
    # file and storage path are passed explicitly.
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=90d'   # Keep 90 days of data
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      # Enables POST /-/reload (used in the Maintenance section).
      - '--web.enable-lifecycle'

  # Grafana: dashboards and visualisation on top of Prometheus.
  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    restart: unless-stopped
    ports:
      # Loopback only: Grafana is served to users through the HTTPS reverse
      # proxy (see GF_SERVER_ROOT_URL), not as plain HTTP on every interface.
      - "127.0.0.1:3000:3000"
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
    environment:
      GF_SECURITY_ADMIN_USER: admin
      # ':?' makes Compose abort with this message when the variable is unset
      # or empty, instead of silently substituting an empty string.
      GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_PASSWORD:?set GRAFANA_PASSWORD in .env}"
      GF_USERS_ALLOW_SIGN_UP: "false"
      # Public URL/domain Grafana uses when generating links.
      GF_SERVER_ROOT_URL: "https://grafana.yourdomain.com"
      GF_SERVER_DOMAIN: "grafana.yourdomain.com"
      # Outgoing mail settings.
      GF_SMTP_ENABLED: "true"
      GF_SMTP_HOST: "smtp.yourdomain.com:587"
      GF_SMTP_USER: "${SMTP_USER}"
      GF_SMTP_PASSWORD: "${SMTP_PASS}"
      GF_SMTP_FROM_ADDRESS: "grafana@yourdomain.com"

  # Alertmanager: receives alerts from Prometheus and routes notifications.
  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    restart: unless-stopped
    ports:
      # Loopback only: the UI/API is unauthenticated, so anyone who can reach
      # it can silence alerts. Prometheus talks to it over the Compose
      # network (alertmanager:9093) and does not need the published port.
      - "127.0.0.1:9093:9093"
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
      - alertmanager_data:/alertmanager

  # Host-level metrics (CPU, memory, disk, network) for Prometheus.
  node-exporter:
    container_name: node_exporter
    image: prom/node-exporter:latest
    restart: unless-stopped
    # Share the host's network and PID namespaces with the exporter.
    network_mode: host
    pid: host
    # Read-only views of the host filesystem, referenced by the --path.* flags.
    volumes:
      - "/:/rootfs:ro"
      - "/proc:/host/proc:ro"
      - "/sys:/host/sys:ro"
    command:
      - "--path.rootfs=/rootfs"
      - "--path.procfs=/host/proc"
      - "--path.sysfs=/host/sys"
      # '$$' is Compose's escape for a literal '$' in the regex.
      - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"

  # Per-container resource metrics for Prometheus.
  cadvisor:
    container_name: cadvisor
    image: gcr.io/cadvisor/cadvisor:latest
    restart: unless-stopped
    privileged: true
    devices:
      - "/dev/kmsg:/dev/kmsg"
    # Host paths are mounted read-only.
    volumes:
      - "/:/rootfs:ro"
      - "/sys:/sys:ro"
      - "/cgroup:/cgroup:ro"
      - "/var/run:/var/run:ro"
      - "/var/lib/docker:/var/lib/docker:ro"

# Named volumes with default settings; data survives container re-creation.
volumes:
  alertmanager_data: {}
  grafana_data: {}
  prometheus_data: {}

Part 2: Prometheus Config

# prometheus/prometheus.yml
global:
  # Scrape targets and evaluate rules every 15 seconds.
  scrape_interval: "15s"
  evaluation_interval: "15s"

# Alertmanager instance that receives firing alerts.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]

# Alert rule files (mounted from ./prometheus/rules in docker-compose.yml).
rule_files:
  - "rules/*.yml"

scrape_configs:
  # Prometheus self-monitoring:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Host metrics:
  # NOTE: node-exporter runs with network_mode: host in docker-compose.yml,
  # so this name only resolves inside the Prometheus container if it is
  # mapped to the Docker host (e.g. via extra_hosts); otherwise use the
  # host's address here.
  - job_name: 'node'
    static_configs:
      - targets: ['node-exporter:9100']
        labels:
          # Overrides the default instance label (the target address).
          instance: 'server-1'

  # Docker container metrics:
  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']

  # --- Optional jobs ---------------------------------------------------------
  # Every configured target gets an `up` series, so an alert on `up == 0`
  # fires for any target that does not exist. Uncomment a job only once the
  # exporter is actually running and reachable from Prometheus.

  # Additional servers (replace with your own hosts):
  # - job_name: 'remote-nodes'
  #   static_configs:
  #     - targets:
  #         - '192.168.1.10:9100'   # server-2 with node_exporter
  #         - '192.168.1.11:9100'   # server-3 with node_exporter
  #       labels:
  #         env: production

  # PostgreSQL (if running postgres_exporter):
  # - job_name: 'postgresql'
  #   static_configs:
  #     - targets: ['postgres-exporter:9187']

  # Redis (if running redis_exporter):
  # - job_name: 'redis'
  #   static_configs:
  #     - targets: ['redis-exporter:9121']

Part 3: HTTPS with Caddy

# Public HTTPS entry point for Grafana.
grafana.yourdomain.com {
    reverse_proxy localhost:3000
}

# Prometheus: only clients in the listed internal ranges are proxied;
# everyone else receives 403.
prometheus.yourdomain.com {
    @internal remote_ip 192.168.0.0/16 10.0.0.0/8

    handle @internal {
        reverse_proxy localhost:9090
    }

    handle {
        respond 403
    }
}

Part 4: Alert Rules

# prometheus/rules/host.yml
groups:
  # Alerts built on node_exporter host metrics plus the generic `up` series.
  - name: host_alerts
    rules:
      # Non-idle CPU share per instance, averaged across all cores.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is {{ $value | humanize }}%"

      # Pseudo, read-only and container-layer filesystems are excluded:
      # e.g. squashfs images are always 100% full and would alert forever.
      - alert: DiskSpaceLow
        expr: >-
          (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|ramfs|squashfs|overlay|iso9660"}
          / node_filesystem_size_bytes) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space low on {{ $labels.instance }}"
          description: "{{ $labels.mountpoint }} is {{ $value | humanize }}% full"

      - alert: HighMemoryUsage
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is {{ $value | humanize }}%"

      # `up` is 0 whenever Prometheus fails to scrape a configured target.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}"
          description: "Prometheus has failed to scrape this target for more than 1 minute"

  # Alerts built on per-container metrics.
  - name: container_alerts
    rules:
      # CPU seconds consumed per second, summed per container name and
      # expressed as a percentage of one core.
      - alert: ContainerHighCPU
        expr: >-
          sum by (name) (rate(container_cpu_usage_seconds_total{name!=""}[5m])) * 100 > 80
        for: 5m
        labels: {severity: warning}
        annotations:
          summary: 'Container {{ $labels.name }} high CPU usage'

Part 5: Alertmanager Config

# alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m
  # Alertmanager does not expand ${VAR} references in its config file, so a
  # value such as "${SLACK_WEBHOOK_URL}" is taken literally. Put the real
  # webhook URL here (and keep this file out of version control), render the
  # file with a tool like envsubst before starting, or use
  # `slack_api_url_file` pointing at a mounted secret file.
  slack_api_url: 'https://hooks.slack.com/services/REPLACE/WITH/YOUR-WEBHOOK'

# Routing tree: everything goes to 'slack-default' unless a child route matches.
route:
  # Alerts sharing these label values are batched into one notification.
  group_by: ['alertname', 'instance']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'slack-default'
  routes:
    # `matchers` replaces the deprecated `match` key (Alertmanager 0.22+).
    - matchers:
        - 'severity="critical"'
      receiver: 'slack-critical'
      # Critical alerts are re-sent more often than the 4h default above.
      repeat_interval: 1h

receivers:
  # Default destination for all alerts.
  - name: 'slack-default'
    slack_configs:
      - channel: '#alerts'
        title: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

  # Critical alerts: dedicated Slack channel plus a PagerDuty page.
  - name: 'slack-critical'
    slack_configs:
      - channel: '#alerts-critical'
        title: '🔴 CRITICAL: {{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
    pagerduty_configs:
      # Alertmanager does not expand ${VAR} references, so a value such as
      # "${PAGERDUTY_KEY}" would be sent literally. Insert the real
      # integration key here (keep this file out of version control) or
      # render the file with a tool like envsubst before starting.
      - routing_key: 'REPLACE_WITH_PAGERDUTY_INTEGRATION_KEY'

inhibit_rules:
  # While a critical alert is firing for an instance, mute warning-level
  # alerts for that same instance. 'alertname' is deliberately not part of
  # `equal`: no alert name is defined at both severities, so requiring it
  # would mean this rule never inhibits anything.
  # `source_matchers`/`target_matchers` replace the deprecated
  # `source_match`/`target_match` keys (Alertmanager 0.22+).
  - source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'
    equal: ['instance']

Part 6: Grafana Dashboards

Import community dashboards

  1. Grafana → +Import
  2. Enter a dashboard ID from grafana.com/grafana/dashboards:
    • 1860 — Node Exporter Full (host metrics)
    • 893 — Docker and system monitoring
    • 9628 — PostgreSQL Database
    • 11835 — Redis Exporter Dashboard
    • 7362 — Cadvisor Docker metrics

Auto-provision dashboards

# grafana/provisioning/datasources/prometheus.yml
apiVersion: 1

datasources:
  # Default data source, addressed by its Compose service name.
  - name: "Prometheus"
    type: "prometheus"
    access: "proxy"
    url: "http://prometheus:9090"
    isDefault: true
# grafana/provisioning/dashboards/default.yml
apiVersion: 1

providers:
  # File-based provider: dashboards are read from `path`.
  - name: "default"
    type: "file"
    folder: ""
    options:
      path: "/etc/grafana/provisioning/dashboards"
# Place dashboard JSON files in:
grafana/provisioning/dashboards/
  node-exporter.json   ← Auto-loaded on startup
  docker.json

Part 7: PromQL Quick Reference

# CPU usage % per instance (non-idle share, 5-minute rate):
100 - 100 * avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))

# Memory usage % (based on MemAvailable):
100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)

# Disk usage % per mount (tmpfs excluded):
100 * (1 - node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes)

# HTTP requests per second (for apps exposing /metrics):
rate(http_requests_total[5m])

# Request duration, 95th percentile:
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))

# Memory used by each named container:
container_memory_usage_bytes{name!=""}

# CPU seconds per second, per container name:
sum by (name) (rate(container_cpu_usage_seconds_total{name!=""}[5m]))

Maintenance

# Update stack:
docker compose pull
docker compose up -d

# Reload Prometheus config (no restart; requires --web.enable-lifecycle).
# -f makes curl exit non-zero if Prometheus rejects the new config:
curl -fsS -X POST http://localhost:9090/-/reload

# Volume names are prefixed with the Compose project name, which defaults to
# the name of the directory holding docker-compose.yml ("prometheus" is
# assumed below; adjust if yours differs). Reading the volume directories
# under Docker's data root normally requires root.

# Backup Prometheus data (-C stores paths relative to the volume root):
prom_dir="$(docker volume inspect prometheus_prometheus_data --format '{{.Mountpoint}}')"
tar -czf "prometheus-backup-$(date +%Y%m%d).tar.gz" -C "$prom_dir" .

# Backup Grafana (dashboards + settings):
grafana_dir="$(docker volume inspect prometheus_grafana_data --format '{{.Mountpoint}}')"
tar -czf "grafana-backup-$(date +%Y%m%d).tar.gz" -C "$grafana_dir" .

# Check active alerts:
curl -fsS http://localhost:9090/api/v1/alerts | jq '.data.alerts[]'

See all open source monitoring tools at OSSAlt.com/categories/devops.

Comments