Skip to content
Snippets Groups Projects
Commit 5477b936 authored by Sergey Yakubov
Browse files

configure more alerts

parent d32333a1
No related branches found
No related tags found
2 merge requests: !170 "Feature ASAPO-150 asapo status monitoring", !175 "Develop"
......@@ -2,6 +2,8 @@ elk_logs = false
asapo_monitor = true
asapo_monitor_alert = true
asapo_alert_email = "xxx"
asapo_alert_email_smart_host = "xxx:25"
receiver_total_memory_size = 500
receiver_dataserver_cache_size = 0 #gb
......
groups:
- name: prometheus_alerts
rules:
# Alerts built on the nomad_nomad_job_summary_running metric, one rule (or
# rule pair) per ASAPO Nomad job. Every rule also fires when the series is
# missing altogether (absent(...)).
- name: asapo-nomad-alerts
  rules:
    # Fatal when the summed running count for asapo-services drops below 2.
    - alert: asapo-services
      expr: sum(nomad_nomad_job_summary_running{exported_job="asapo-services"}) < 2 or absent(nomad_nomad_job_summary_running{exported_job="asapo-services"})
      for: 1s
      labels:
        severity: fatal
        group: asapo-cluster
    # Fatal when the summed running count for asapo-monitoring drops below 2.
    - alert: asapo-monitoring
      expr: sum(nomad_nomad_job_summary_running{exported_job="asapo-monitoring"}) < 2 or absent(nomad_nomad_job_summary_running{exported_job="asapo-monitoring"})
      for: 1s
      labels:
        severity: fatal
        group: asapo-cluster
    # Fatal when asapo-mongo reports a running count below 1.
    - alert: asapo-mongo
      expr: nomad_nomad_job_summary_running{exported_job="asapo-mongo"} < 1 or absent(nomad_nomad_job_summary_running{exported_job="asapo-mongo"})
      for: 1s
      labels:
        severity: fatal
        group: asapo-cluster
    # Warning when some, but not all, of the expected receivers are running.
    # The expected count is injected from the NOMAD_META_n_receivers variable.
    # "and on()" is required: the right-hand sum() carries no labels, so a
    # plain "and" would never match the labelled left-hand series and the
    # comparison branch could never fire.
    - alert: asapo-receivers-incomplete
      expr: (nomad_nomad_job_summary_running{exported_job="asapo-receivers"} < {{ env "NOMAD_META_n_receivers" }} and on() sum (nomad_nomad_job_summary_running{exported_job="asapo-receivers"}) > 0) or absent(nomad_nomad_job_summary_running{exported_job="asapo-receivers"})
      for: 30s
      labels:
        severity: warn
        group: asapo-cluster
    # Fatal when no receiver is running at all.
    - alert: asapo-receivers-absent
      expr: nomad_nomad_job_summary_running{exported_job="asapo-receivers"} < 1 or absent(nomad_nomad_job_summary_running{exported_job="asapo-receivers"})
      for: 1s
      labels:
        severity: fatal
        group: asapo-cluster
    # Fatal when asapo-nginx reports a running count below 1.
    - alert: asapo-nginx
      expr: nomad_nomad_job_summary_running{exported_job="asapo-nginx"} < 1 or absent(nomad_nomad_job_summary_running{exported_job="asapo-nginx"})
      for: 1s
      labels:
        severity: fatal
        group: asapo-cluster
    # Fatal when the fluentd task group of asapo-logging is not running.
    - alert: asapo-fluentd
      expr: nomad_nomad_job_summary_running{exported_job="asapo-logging", task_group="fluentd"} < 1 or absent(nomad_nomad_job_summary_running{exported_job="asapo-logging", task_group="fluentd"})
      for: 1s
      labels:
        severity: fatal
        group: asapo-cluster
# Alerts built on the nomad_nomad_job_summary_running metric, one rule (or
# rule pair) per ASAPO Nomad job. Every rule also fires when the series is
# missing altogether (absent(...)).
#
# In the "-incomplete" rules the expected instance count is injected from a
# NOMAD_META_* variable, and "and on()" is required: the right-hand sum()
# carries no labels, so a plain "and" would never match the labelled
# left-hand series and the comparison branch could never fire.
- name: asapo-nomad-alerts
  rules:
    # Fatal when the summed running count for asapo-services drops below 2.
    - alert: asapo-services
      expr: sum(nomad_nomad_job_summary_running{exported_job="asapo-services"}) < 2 or absent(nomad_nomad_job_summary_running{exported_job="asapo-services"})
      for: 10s
      labels:
        severity: fatal
        group: asapo-cluster
    # Fatal when the summed running count for asapo-monitoring drops below 2.
    - alert: asapo-monitoring
      expr: sum(nomad_nomad_job_summary_running{exported_job="asapo-monitoring"}) < 2 or absent(nomad_nomad_job_summary_running{exported_job="asapo-monitoring"})
      for: 10s
      labels:
        severity: fatal
        group: asapo-cluster
    # Fatal when asapo-mongo reports a running count below 1.
    - alert: asapo-mongo
      expr: nomad_nomad_job_summary_running{exported_job="asapo-mongo"} < 1 or absent(nomad_nomad_job_summary_running{exported_job="asapo-mongo"})
      for: 10s
      labels:
        severity: fatal
        group: asapo-cluster
    # Warning when some, but not all, of the expected receivers are running.
    - alert: asapo-receivers-incomplete
      expr: (nomad_nomad_job_summary_running{exported_job="asapo-receivers"} < {{ env "NOMAD_META_n_receivers" }} and on() sum (nomad_nomad_job_summary_running{exported_job="asapo-receivers"}) > 0) or absent(nomad_nomad_job_summary_running{exported_job="asapo-receivers"})
      for: 60s
      labels:
        severity: warn
        group: asapo-cluster
    # Fatal when no receiver is running at all.
    - alert: asapo-receivers-absent
      expr: nomad_nomad_job_summary_running{exported_job="asapo-receivers"} < 1 or absent(nomad_nomad_job_summary_running{exported_job="asapo-receivers"})
      for: 10s
      labels:
        severity: fatal
        group: asapo-cluster
    # Fatal when asapo-nginx reports a running count below 1.
    - alert: asapo-nginx
      expr: nomad_nomad_job_summary_running{exported_job="asapo-nginx"} < 1 or absent(nomad_nomad_job_summary_running{exported_job="asapo-nginx"})
      for: 10s
      labels:
        severity: fatal
        group: asapo-cluster
    # Warning when some, but not all, of the expected file-transfer instances
    # are running.
    - alert: asapo-fts-incomplete
      expr: (nomad_nomad_job_summary_running{exported_job="asapo-file-transfer"} < {{ env "NOMAD_META_n_fts" }} and on() sum (nomad_nomad_job_summary_running{exported_job="asapo-file-transfer"}) > 0) or absent(nomad_nomad_job_summary_running{exported_job="asapo-file-transfer"})
      for: 60s
      labels:
        severity: warn
        group: asapo-cluster
    # Fatal when no file-transfer instance is running at all.
    - alert: asapo-fts-absent
      expr: nomad_nomad_job_summary_running{exported_job="asapo-file-transfer"} < 1 or absent(nomad_nomad_job_summary_running{exported_job="asapo-file-transfer"})
      for: 10s
      labels:
        severity: fatal
        group: asapo-cluster
    # Warning when some, but not all, of the expected brokers are running.
    - alert: asapo-brokers-incomplete
      expr: (nomad_nomad_job_summary_running{exported_job="asapo-brokers"} < {{ env "NOMAD_META_n_brokers" }} and on() sum (nomad_nomad_job_summary_running{exported_job="asapo-brokers"}) > 0) or absent(nomad_nomad_job_summary_running{exported_job="asapo-brokers"})
      for: 60s
      labels:
        severity: warn
        group: asapo-cluster
    # Fatal when no broker is running at all.
    - alert: asapo-brokers-absent
      expr: nomad_nomad_job_summary_running{exported_job="asapo-brokers"} < 1 or absent(nomad_nomad_job_summary_running{exported_job="asapo-brokers"})
      for: 10s
      labels:
        severity: fatal
        group: asapo-cluster
    # Fatal when the fluentd task group of asapo-logging is not running.
    - alert: asapo-fluentd
      expr: nomad_nomad_job_summary_running{exported_job="asapo-logging", task_group="fluentd"} < 1 or absent(nomad_nomad_job_summary_running{exported_job="asapo-logging", task_group="fluentd"})
      for: 10s
      labels:
        severity: fatal
        group: asapo-cluster
# Alerts built on the "up" metric of the scrape targets whose job label
# carries the ASAPO service name. Each rule also fires when no "up" series
# exists for the service (absent(...)).
- name: asapo-consul-alerts
  rules:
    # Fatal: no discovery target is up.
    - alert: asapo-discovery
      expr: sum (up{job="asapo-discovery"}) < 1 or absent(up{job="asapo-discovery"})
      for: 1s
      labels:
        group: asapo-cluster
        severity: fatal

    # Warning: at least one broker is up, but fewer than the count taken from
    # the NOMAD_META_n_brokers variable.
    - alert: asapo-brokers-incomplete
      expr: (sum (up{job="asapo-broker"}) < {{ env "NOMAD_META_n_brokers" }} and sum (up{job="asapo-broker"}) > 0) or absent(up{job="asapo-broker"})
      for: 30s
      labels:
        group: asapo-cluster
        severity: warn

    # Fatal: every broker target is down.
    - alert: asapo-brokers-absent
      expr: sum (up{job="asapo-broker"}) == 0 or absent(up{job="asapo-broker"})
      for: 1s
      labels:
        group: asapo-cluster
        severity: fatal

    # Warning: at least one receiver is up, but fewer than the count taken
    # from the NOMAD_META_n_receivers variable.
    - alert: asapo-receivers-incomplete
      expr: (sum (up{job="asapo-receiver"}) < {{ env "NOMAD_META_n_receivers" }} and sum (up{job="asapo-receiver"}) > 0) or absent(up{job="asapo-receiver"})
      for: 30s
      labels:
        group: asapo-cluster
        severity: warn

    # Fatal: every receiver target is down.
    - alert: asapo-receivers-absent
      expr: sum (up{job="asapo-receiver"}) == 0 or absent(up{job="asapo-receiver"})
      for: 1s
      labels:
        group: asapo-cluster
        severity: fatal
# Alerts built on the "up" metric of the scrape targets whose job label
# carries the ASAPO service name. Each rule also fires when no "up" series
# exists for the service (absent(...)).
- name: asapo-consul-alerts
  rules:
    # Fatal: no discovery target is up.
    - alert: asapo-discovery
      expr: sum (up{job="asapo-discovery"}) < 1 or absent(up{job="asapo-discovery"})
      for: 10s
      labels:
        group: asapo-cluster
        severity: fatal

    # Fatal: the MongoDB metrics exporter target is not up.
    - alert: asapo-mongodb-monitor
      expr: sum (up{job="asapo-mongodb-monitor"}) < 1 or absent(up{job="asapo-mongodb-monitor"})
      for: 10s
      labels:
        group: asapo-cluster
        severity: fatal

    # Warning: at least one broker is up, but fewer than the count taken from
    # the NOMAD_META_n_brokers variable.
    - alert: asapo-brokers-incomplete
      expr: (sum (up{job="asapo-broker"}) < {{ env "NOMAD_META_n_brokers" }} and sum (up{job="asapo-broker"}) > 0) or absent(up{job="asapo-broker"})
      for: 60s
      labels:
        group: asapo-cluster
        severity: warn

    # Fatal: every broker target is down.
    - alert: asapo-brokers-absent
      expr: sum (up{job="asapo-broker"}) == 0 or absent(up{job="asapo-broker"})
      for: 10s
      labels:
        group: asapo-cluster
        severity: fatal

    # Warning: at least one receiver is up, but fewer than the count taken
    # from the NOMAD_META_n_receivers variable.
    - alert: asapo-receivers-incomplete
      expr: (sum (up{job="asapo-receiver"}) < {{ env "NOMAD_META_n_receivers" }} and sum (up{job="asapo-receiver"}) > 0) or absent(up{job="asapo-receiver"})
      for: 60s
      labels:
        group: asapo-cluster
        severity: warn

    # Fatal: every receiver target is down.
    - alert: asapo-receivers-absent
      expr: sum (up{job="asapo-receiver"}) == 0 or absent(up{job="asapo-receiver"})
      for: 10s
      labels:
        group: asapo-cluster
        severity: fatal
......@@ -22,6 +22,52 @@ job "asapo-mongo" {
mode = "delay"
}
# Ports for the tasks of this group.
network {
  # MongoDB itself: fixed host port taken from the mongo_port template variable.
  port "mongo" {
    static = "${mongo_port}"
  }

  # Metrics exporter: mapped to port 9216 inside the task's network
  # (presumably the exporter's listen port - confirm against the image).
  port "mongo_monitor" {
    to = 9216
  }
}
# MongoDB metrics exporter, registered as the "asapo-mongodb-monitor" service.
task "mongo-monitor" {
  driver = "docker"
  user   = "${asapo_user}"

  # Poststart sidecar: started once the main task is up and kept running
  # alongside it.
  lifecycle {
    hook    = "poststart"
    sidecar = true
  }

  config {
    image        = "yakser/mongodb-exporter"
    ports        = ["mongo_monitor"]
    userns_mode  = "host"
    security_opt = ["no-new-privileges"]

    # The doubled dollar sign presumably escapes the outer template pass so
    # that Nomad itself resolves NOMAD_ADDR_mongo at runtime - confirm.
    args = [
      "--mongodb.uri=mongodb://$${NOMAD_ADDR_mongo}"
    ]
  }

  service {
    name = "asapo-mongodb-monitor"
    port = "mongo_monitor"

    # HTTP liveness probe against the exporter's root path.
    check {
      name     = "alive"
      type     = "http"
      path     = "/"
      interval = "10s"
      timeout  = "1s"
    }

    # NOTE(review): a grace of 6000s (100 minutes) postpones any
    # check-triggered restart for a long time - confirm this is intended.
    check_restart {
      limit           = 2
      grace           = "6000s"
      ignore_warnings = false
    }
  }
}
task "mongo" {
driver = "docker"
user = "${asapo_user}"
......@@ -36,11 +82,6 @@ job "asapo-mongo" {
resources {
memory = "${mongo_total_memory_size}"
network {
port "mongo" {
static = "${mongo_port}"
}
}
}
service {
......
......@@ -122,6 +122,7 @@ job "asapo-monitoring" {
meta {
n_brokers = "${n_brokers}"
n_receivers = "${n_receivers}"
n_fts = "${n_fts}"
}
service {
name = "prometheus"
......
......@@ -20,7 +20,6 @@ rule_files:
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: "prometheus"
consul_sd_configs:
- server: '{{ env "NOMAD_IP_prometheus_ui" }}:8500'
......@@ -38,19 +37,13 @@ scrape_configs:
metrics_path: /v1/metrics
params:
format: ['prometheus']
- job_name: discovery
- job_name: asapo
consul_sd_configs:
- server: '{{ env "NOMAD_IP_prometheus_ui" }}:8500'
services:
- 'asapo-discovery'
relabel_configs:
- source_labels: [__meta_consul_service]
target_label: job
- job_name: broker
consul_sd_configs:
- server: '{{ env "NOMAD_IP_prometheus_ui" }}:8500'
services:
- 'asapo-broker'
- 'asapo-mongodb-monitor'
relabel_configs:
- source_labels: [__meta_consul_service]
target_label: job
......
......@@ -109,6 +109,7 @@ data "template_file" "asapo_monitoring" {
vars = {
n_brokers = "${var.n_brokers}"
n_receivers = "${var.n_receivers}"
n_fts = "${var.n_fts}"
service_dir = "${var.service_dir}"
scripts_dir = "${var.job_scripts_dir}"
asapo_monitor = "${var.asapo_monitor}"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment