From 2a03a415b66455cfacbc3b1a749a28ae635b770e Mon Sep 17 00:00:00 2001
From: Tim Schoof <tim.schoof@desy.de>
Date: Tue, 11 Feb 2020 12:02:17 +0100
Subject: [PATCH] Improve telegraf health check

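Replace the pidof script check with an HTTP check against telegraf's
health output plugin. The plugin is configured with the example values from
https://github.com/influxdata/telegraf/tree/master/plugins/outputs/health.
Enabling the internal input means the internal metrics will also be written
to InfluxDB, but at the current low ingest rate this should not be a problem.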
---
 .../scripts/monitoring.nmd.tpl                   |  8 +++++---
 .../scripts/telegraf.conf.tpl                    | 16 ++++++++++++++++
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/deploy/asapo_orchestration_docker/scripts/monitoring.nmd.tpl b/deploy/asapo_orchestration_docker/scripts/monitoring.nmd.tpl
index 3ab354c40..e0282a8db 100644
--- a/deploy/asapo_orchestration_docker/scripts/monitoring.nmd.tpl
+++ b/deploy/asapo_orchestration_docker/scripts/monitoring.nmd.tpl
@@ -159,6 +159,8 @@ job "monitoring" {
         network {
           port "telegraf_stream" {
           }
+          port "telegraf_health" {
+          }
         }
       }
 
@@ -167,9 +169,9 @@ job "monitoring" {
         port = "telegraf_stream"
         check {
           name     = "telegraf-alive"
-          type     = "script"
-          command  = "/bin/pidof"
-          args     = ["telegraf"]
+          type     = "http"
+          path     = "/"
+          port     = "telegraf_health"
           interval = "10s"
           timeout  = "2s"
         }
diff --git a/deploy/asapo_orchestration_docker/scripts/telegraf.conf.tpl b/deploy/asapo_orchestration_docker/scripts/telegraf.conf.tpl
index e962ebe74..13e65266d 100644
--- a/deploy/asapo_orchestration_docker/scripts/telegraf.conf.tpl
+++ b/deploy/asapo_orchestration_docker/scripts/telegraf.conf.tpl
@@ -9,6 +9,9 @@
 
 [[inputs.consul]]
 
+[[inputs.internal]]
+  collect_memstats = false
+
 [[outputs.file]]
 	files=["stdout"]
 
@@ -16,3 +19,16 @@
 [[outputs.influxdb]]
     urls = ["http://localhost:{{ env "NOMAD_META_nginx_port" }}/influxdb"]
 
+
+[[outputs.health]]
+  service_address = "http://{{ env "NOMAD_ADDR_telegraf_health" }}"
+
+  namepass = ["internal_write"]
+  tagpass = { output = ["influxdb"] }
+
+  [[outputs.health.compares]]
+    field = "buffer_size"
+    lt = 5000.0
+
+  [[outputs.health.contains]]
+    field = "buffer_size"
-- 
GitLab