Monitoring

InfluxDB

Documentation

Install

# docker
docker run \
    --name influxdb \
    -p 8086:8086 \
    -v $PWD/config.yml:/etc/influxdb2/config.yml \
    -v /data/influxdb:/var/lib/influxdb2 \
    influxdb:2.2.0

Setup

docker exec influxdb influx setup \
      --username {username} \
      --password {password} \
      --org {org} \
      --bucket default \
      --retention 8760h \
      --force

Query

from(bucket:"example-bucket")
  |> range(start: -15m)
  |> filter(fn: (r) =>
    r._measurement == "cpu" and
    r._field == "usage_system" and
    r.cpu == "cpu-total"
  )

Prometheus

Documentation

安装Node Exporter

在要监控的机器上安装 Node Exporter

wget https://github.com/prometheus/node_exporter/releases/download/v*/node_exporter-*.*-amd64.tar.gz
tar xvfz node_exporter-*.*-amd64.tar.gz
cd node_exporter-*.*-amd64
./node_exporter

测试是否安装成功

curl http://localhost:9100/metrics

配置Prometheus

# 全局配置
global:
  scrape_interval: 1m # 拉取数据的周期
  evaluation_interval: 1m # 计算rules的周期 

# 拉取配置
scrape_configs:
  # 监控自己
  - job_name: 'prometheus'
    static_configs:
    - targets: ['localhost:9090']

  # 监控配置的节点
  - job_name: node
    static_configs:
    - targets: ['localhost:9100']

# 导入rules
rule_files:
  - "rules_1.yml"
  - "rules_2.yml"

# Alertmanager配置
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - alertmanager:9093

配置帐号密码

# 创建密码并用 bcrypt 进行hash
htpasswd -nBC 10 "" | tr -d ':\n'

将以下内容写入web-config.yml

# Usernames and passwords required to connect to Prometheus.
# Passwords are hashed with bcrypt: https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md#about-bcrypt
basic_auth_users:
  your_username: 'your_password_hashed'

启动Prometheus

./prometheus --config.file=./prometheus.yml --web.config.file=./web-config.yml

或者docker启动

docker run -d \
    --name prometheus \
    -p 9090:9090 \
    -v /path/to/prometheus.yml:/etc/prometheus/prometheus.yml \
    -v /path/to/web-config.yml:/etc/prometheus/web-config.yml \
    -v prometheus-volume:/prometheus \
    prom/prometheus:v2.26.0

配置Nginx转发

location /prometheus/ {
    proxy_pass http://127.0.0.1:9090/;
}

启动时加上配置

./prometheus \
  --config.file=./prometheus.yml \
  --web.config.file=./web-config.yml \
  --web.external-url=https://cyzlmh.xyz/prometheus \
  --web.route-prefix=/

HTTP API

# 查看所有标签
curl "http://localhost:9090/api/v1/labels"

# 查看指定标签所有值
curl "http://localhost:9090/api/v1/label/{label_name}/values"

# 查看所有series
curl "http://localhost:9090/api/v1/label/__name__/values"

# 查看过滤后的series
curl "http://localhost:9090/api/v1/series?" --data-urlencode "match[]=node_load1"

# query
curl "http://localhost:9090/api/v1/query?query=node_load1"

# range query
curl "http://localhost:9090/api/v1/query_range?query=up&start=2015-07-01T20:10:30.781Z&end=2015-07-01T20:11:00.781Z&step=15s"

# delete series(需要在启动 Prometheus 时加上 --web.enable-admin-api,否则 admin API 不可用)
curl -X POST -g 'http://localhost:9090/api/v1/admin/tsdb/delete_series?match[]=up&match[]=process_start_time_seconds{job="prometheus"}'

AlertManager

  • Grouping:将类似的告警合并,通过routing实现
  • Inhibition:抑制不必要的告警,例如若某告警已经触发,则不触发某其他告警。通过配置文件配置匹配规则
  • Silence:将告警抑制一段时间,通过规则匹配

告警规则范例

Awesome Prometheus alerts - Collection of alerting rules

Loki

Documentation

Install

docker run -d \
    --name loki \
    -v $(pwd):/mnt/config \
    -v loki-volume:/loki \
    -p 3100:3100 \
    grafana/loki:2.2.0 \
    -config.file=/mnt/config/loki-config.yaml

HTTP API

Query

curl -G -s  "http://localhost:3100/loki/api/v1/query_range" \
--data-urlencode 'query=sum(rate({job="varlogs"}[10m])) by (level)' \
--data-urlencode 'step=300'

curl -G -s  "http://localhost:3100/loki/api/v1/query_range" \
--data-urlencode 'query={job="varlogs"}'

Push

curl -v -H "Content-Type: application/json" -XPOST \
-s "http://localhost:3100/loki/api/v1/push" \
--data-raw '{"streams":[{"stream":{"foo":"bar2"},"values":[["1618192957125000000", "fizzbuzz"]]}]}'

Grafana

Documentation

Install

# docker
docker run -d -p 3000:3000 --name grafana grafana/grafana:latest

# 允许嵌入其他页面,否则会跨域请求报错
docker run -d \
    -p 3000:3000 \
    --name grafana \
    -v grafana-volume:/var/lib/grafana \
    -e GF_SECURITY_ALLOW_EMBEDDING=true \
    grafana/grafana:7.5.3

启动参数

  • 账号密码
    • -e "GF_SECURITY_ADMIN_USER={your_username}"
    • -e "GF_SECURITY_ADMIN_PASSWORD={your_password}"
  • 允许嵌入其他页面,否则会跨域请求报错
    • -e GF_SECURITY_ALLOW_EMBEDDING=true
  • 告警邮件
    • -e "GF_SMTP_HOST=smtp.163.com:25" -e "GF_SMTP_USER=cyzlmh" -e "GF_SMTP_PASSWORD={your_password}" -e "GF_SMTP_FROM_ADDRESS={your_address}" -e "GF_SMTP_FROM_NAME={your_name}"

Nginx 转发

location /grafana/ {
    proxy_pass http://127.0.0.1:3000/;
}

启动参数添加

  • -e "GF_SERVER_ROOT_URL=https://{your_host}/grafana/"

仪表盘

模版:https://grafana.com/grafana/dashboards/

使用技巧

Vector

Documentation

Telegraf

Documentation

Install

docker pull telegraf:1.18.3

# 创建配置文件
docker run -it telegraf:1.18.3 telegraf config > telegraf.conf

# bin 启动
/usr/bin/telegraf -config /etc/telegraf/telegraf.conf -config-directory /etc/telegraf/telegraf.d

# systemctl 启动
sudo cp telegraf.conf /etc/telegraf/telegraf.conf
sudo systemctl restart telegraf
sudo systemctl status telegraf

Docker

添加docker权限

sudo gpasswd -a telegraf docker

配置

# global
[global_tags]
  node = "devcloud"
  collector = "telegraf"

[agent]
  interval = "60s"
  round_interval = true
  metric_batch_size = 1000
  metric_buffer_limit = 10000
  collection_jitter = "0s"
  flush_interval = "10s"
  flush_jitter = "0s"
  precision = ""
  hostname = ""
  omit_hostname = false
  debug = true
  quiet = false
  logtarget = "file"
  logfile = "/data/yzchen/Softwares/telegraf/telegraf.logs"

# plugins
[[inputs.cpu]]
  percpu = false
  totalcpu = true
  collect_cpu_time = false
  report_active = false
  [inputs.cpu.tags]
    input = "cpu"

[[inputs.mem]]
  [inputs.mem.tags]
    input = "memory"

[[inputs.disk]]
  ignore_fs = ["udev", "tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"]
  [inputs.disk.tags]
    input = "disk"

[[inputs.net]]
  interfaces = ["eth*"]
  ignore_protocol_stats = true

[[inputs.docker]]
  endpoint = "unix:///var/run/docker.sock"
  gather_services = false
  source_tag = false
  container_names = []
  perdevice = false
  total = true
  [inputs.docker.tags]
    input = "docker"

[[inputs.procstat]]
  pattern = "elasticsearch/jdk/bin/java"
  [inputs.procstat.tags]
    input = "procstat"
    process = "elasticsearch"

[[inputs.procstat]]
  pattern = "kibana"
  [inputs.procstat.tags]
    input = "procstat"
    process = "kibana"

# output
[[outputs.prometheus_client]]
  listen = ":9273"

cAdvisor

Run with Docker

VERSION=v0.49.1
sudo docker run \
  --volume=/:/rootfs:ro \
  --volume=/var/run:/var/run:ro \
  --volume=/sys:/sys:ro \
  --volume=/var/lib/docker/:/var/lib/docker:ro \
  --volume=/dev/disk/:/dev/disk:ro \
  --publish 8080:8080 \
  --detach=true \
  --name=cadvisor \
  --privileged \
  --device=/dev/kmsg \
  ccr.ccs.tencentyun.com/yzchen/cadvisor:$VERSION

在 Prometheus 中加上配置

scrape_configs:
  - job_name: 'cadvisor'
    static_configs:
    - targets: ['localhost:8080']

在 Grafana 中导入 dashboard 模板:Cadvisor exporter