Skip to content
Snippets Groups Projects
Commit 69fe6eb4 authored by Sergey Yakubov's avatar Sergey Yakubov
Browse files

add metrics endpoint to broker, update alerts

parent 0380b7bb
No related branches found
No related tags found
No related merge requests found
......@@ -9,6 +9,7 @@ require (
github.com/blastrain/vitess-sqlparser v0.0.0-20201030050434-a139afbb1aba
github.com/gorilla/mux v1.8.0
github.com/influxdata/influxdb1-client v0.0.0-20200827194710-b269163b24ab
github.com/prometheus/client_golang v1.11.0
github.com/rs/xid v1.2.1
github.com/stretchr/testify v1.7.0
go.mongodb.org/mongo-driver v1.5.3
......
This diff is collapsed.
......@@ -6,6 +6,7 @@ import (
log "asapo_common/logger"
"asapo_common/utils"
"errors"
"github.com/prometheus/client_golang/prometheus/promhttp"
"net/http"
_ "net/http/pprof"
"strconv"
......@@ -24,6 +25,8 @@ func Start() {
}
mux := utils.NewRouter(listRoutes)
mux.PathPrefix("/debug/pprof/").Handler(http.DefaultServeMux)
mux.PathPrefix("/metrics").Handler(promhttp.Handler())
log.Info("Listening on port: " + strconv.Itoa(settings.Port))
log.Fatal(http.ListenAndServe(":"+strconv.Itoa(settings.Port), http.HandlerFunc(mux.ServeHTTP)))
}
......
......@@ -8,36 +8,43 @@ groups:
for: 1s
labels:
severity: fatal
group: asapo-cluster
- alert: asapo-monitoring
expr: sum(nomad_nomad_job_summary_running{exported_job="asapo-monitoring"}) < 2 or absent(nomad_nomad_job_summary_running{exported_job="asapo-monitoring"})
for: 1s
labels:
severity: fatal
group: asapo-cluster
- alert: asapo-mongo
expr: nomad_nomad_job_summary_running{exported_job="asapo-mongo"} < 1 or absent(nomad_nomad_job_summary_running{exported_job="asapo-mongo"})
for: 1s
labels:
severity: fatal
group: asapo-cluster
- alert: asapo-receivers-incomplete
expr: (nomad_nomad_job_summary_running{exported_job="asapo-receivers"} < {{ env "NOMAD_META_n_receivers" }} and sum (nomad_nomad_job_summary_running{exported_job="asapo-receivers"}) > 0) or absent(nomad_nomad_job_summary_running{exported_job="asapo-receivers"})
for: 30s
labels:
severity: warn
group: asapo-cluster
- alert: asapo-receivers-absent
expr: nomad_nomad_job_summary_running{exported_job="asapo-receivers"} < 1 or absent(nomad_nomad_job_summary_running{exported_job="asapo-receivers"})
for: 1s
labels:
severity: fatal
group: asapo-cluster
- alert: asapo-nginx
expr: nomad_nomad_job_summary_running{exported_job="asapo-nginx"} < 1 or absent(nomad_nomad_job_summary_running{exported_job="asapo-nginx"})
for: 1s
labels:
severity: fatal
group: asapo-cluster
- alert: asapo-fluentd
expr: nomad_nomad_job_summary_running{exported_job="asapo-logging", task_group="fluentd"} < 1 or absent(nomad_nomad_job_summary_running{exported_job="asapo-logging", task_group="fluentd"})
for: 1s
labels:
severity: fatal
group: asapo-cluster
- name: asapo-consul-alerts
rules:
......@@ -46,13 +53,28 @@ groups:
for: 1s
labels:
severity: fatal
group: asapo-cluster
- alert: asapo-brokers-incomplete
expr: (sum (up{job="asapo-broker"}) < {{ env "NOMAD_META_n_brokers" }} and sum (up{job="asapo-broker"}) > 0) or absent(up{job="asapo-broker"})
for: 30s
labels:
severity: warn
group: asapo-cluster
- alert: asapo-brokers-absent
expr: sum (up{job="asapo-broker"}) == 0 or absent(up{job="asapo-broker"})
for: 1s
labels:
severity: fatal
\ No newline at end of file
severity: fatal
group: asapo-cluster
- alert: asapo-receivers-incomplete
expr: (sum (up{job="asapo-receiver"}) < {{ env "NOMAD_META_n_receivers" }} and sum (up{job="asapo-receiver"}) > 0) or absent(up{job="asapo-receiver"})
for: 30s
labels:
severity: warn
group: asapo-cluster
- alert: asapo-receivers-absent
expr: sum (up{job="asapo-receiver"}) == 0 or absent(up{job="asapo-receiver"})
for: 1s
labels:
severity: fatal
group: asapo-cluster
route:
group_by: ['group','severity']
group_wait: 30s
group_interval: 5m
repeat_interval: 1h
receiver: 'email'
receivers:
- name: 'email'
email_configs:
- smarthost: '{{ env "NOMAD_META_email_smart_host" }}'
to: '{{ env "NOMAD_META_alert_email" }}'
from: 'noreply@desy.de'
send_resolved: true
require_tls: false
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
......@@ -34,9 +34,15 @@ job "asapo-monitoring" {
"--storage.path=/alertmanager"
]
volumes = [
"/${service_dir}/alertmanager:/alertmanager"
"/${service_dir}/alertmanager:/alertmanager",
"local/alertmanager.yml:/etc/alertmanager/alertmanager.yml",
]
}
template {
source = "${scripts_dir}/alertmanager.yml.tpl"
destination = "local/alertmanager.yml"
change_mode = "restart"
}
resources {
memory = "${alertmanager_total_memory_size}"
network {
......@@ -47,6 +53,12 @@ job "asapo-monitoring" {
}
}
}
meta {
alert_email = "${asapo_alert_email}"
email_smart_host = "${asapo_alert_email_smart_host}"
}
service {
name = "alertmanager"
port = "alertmanager_ui"
......
......@@ -74,7 +74,7 @@ job "asapo-receivers" {
initial_status = "passing"
}
meta {
metrics-port = "${NOMAD_PORT_recv_metrics}"
metrics-port = "$${NOMAD_PORT_recv_metrics}"
}
%{ else }
check {
......
......@@ -16,6 +16,8 @@ asapo_monitor = true
asapo_monitor_alert = false
prometheus_version = "v2.30.3"
alertmanager_version = "v0.23.0"
asapo_alert_email = ""
asapo_alert_email_smart_host = ""
job_scripts_dir = "/var/run/asapo"
......
......@@ -22,10 +22,14 @@ rule_files:
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: "prometheus"
static_configs:
- targets: ["localhost:9090"]
# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.
consul_sd_configs:
- server: '{{ env "NOMAD_IP_prometheus_ui" }}:8500'
services:
- 'prometheus'
relabel_configs:
- source_labels: [__meta_consul_service]
target_label: job
metrics_path: /prometheus/metrics
- job_name: "nomad metrics"
consul_sd_configs:
- server: '{{ env "NOMAD_IP_prometheus_ui" }}:8500'
......@@ -42,6 +46,14 @@ scrape_configs:
relabel_configs:
- source_labels: [__meta_consul_service]
target_label: job
- job_name: broker
consul_sd_configs:
- server: '{{ env "NOMAD_IP_prometheus_ui" }}:8500'
services:
- 'asapo-broker'
relabel_configs:
- source_labels: [__meta_consul_service]
target_label: job
- job_name: receiver
consul_sd_configs:
- server: '{{ env "NOMAD_IP_prometheus_ui" }}:8500'
......
......@@ -120,6 +120,8 @@ data "template_file" "asapo_monitoring" {
prometheus_total_memory_size = "${var.prometheus_total_memory_size}"
alertmanager_total_memory_size = "${var.alertmanager_total_memory_size}"
asapo_user = "${var.asapo_user}"
asapo_alert_email = "${var.asapo_alert_email}"
asapo_alert_email_smart_host = "${var.asapo_alert_email_smart_host}"
}
}
......
......@@ -6,6 +6,10 @@ variable "asapo_monitor" {}
variable "asapo_monitor_alert" {}
variable "asapo_alert_email" {}
variable "asapo_alert_email_smart_host" {}
variable "force_pull_images" {}
variable "asapo_user" {}
......
Loading… (0%) — or the diff failed to load.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment