From 6e5503abf1344a3a7010ef3030bfd27b8793b0dc Mon Sep 17 00:00:00 2001
From: Eric Cano <Eric.Cano@cern.ch>
Date: Fri, 2 Aug 2019 09:25:36 +0200
Subject: [PATCH] Added flag for immediate garbage collection of agents.

The process can now indicate the agent structure can be garbage collected, saving a wait for a timeout. This
is typically hapenning when archive jobs need requeueing after a session hits the end of the tape.
---
 objectstore/Agent.cpp            | 12 ++++++++++++
 objectstore/Agent.hpp            |  4 ++++
 objectstore/AgentWatchdog.hpp    | 32 ++++++++++++++++++++++----------
 objectstore/BackendPopulator.cpp |  2 ++
 objectstore/cta.proto            |  1 +
 5 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/objectstore/Agent.cpp b/objectstore/Agent.cpp
index 89cbe61546..4e6c3e91ef 100644
--- a/objectstore/Agent.cpp
+++ b/objectstore/Agent.cpp
@@ -45,6 +45,7 @@ void cta::objectstore::Agent::initialize() {
   m_payload.set_timeout_us(120*1000*1000);
   m_payload.set_description("");
   m_payload.set_being_garbage_collected(false);
+  m_payload.set_gc_needed(false);
   m_payloadInterpreted = true;
 }
 
@@ -149,6 +150,17 @@ void cta::objectstore::Agent::setBeingGarbageCollected() {
   m_payload.set_being_garbage_collected(true);
 }
 
+void cta::objectstore::Agent::setNeedsGarbageCollection() {
+  checkPayloadWritable();
+  m_payload.set_gc_needed(true);
+}
+
+bool cta::objectstore::Agent::needsGarbageCollection() {
+  checkPayloadReadable();
+  if (!m_payload.has_gc_needed()) return false;
+  return m_payload.gc_needed();
+}
+
 void cta::objectstore::Agent::garbageCollect(const std::string& presumedOwner, AgentReference & agentReference, log::LogContext & lc,
     cta::catalogue::Catalogue & catalogue) {
   checkPayloadWritable();
diff --git a/objectstore/Agent.hpp b/objectstore/Agent.hpp
index da25916118..de371d1134 100644
--- a/objectstore/Agent.hpp
+++ b/objectstore/Agent.hpp
@@ -62,6 +62,10 @@ public:
   
   bool isBeingGarbageCollected();
   
+  void setNeedsGarbageCollection();
+  
+  bool needsGarbageCollection();
+  
   void garbageCollect(const std::string &presumedOwner, AgentReference & agentReference, log::LogContext & lc,
     cta::catalogue::Catalogue & catalogue) override;
   
diff --git a/objectstore/AgentWatchdog.hpp b/objectstore/AgentWatchdog.hpp
index 9f1a157a19..0ae9af672b 100644
--- a/objectstore/AgentWatchdog.hpp
+++ b/objectstore/AgentWatchdog.hpp
@@ -25,25 +25,28 @@ namespace cta { namespace objectstore {
 class AgentWatchdog {
 public:
   AgentWatchdog(const std::string & name, Backend & os): m_agent(name, os), 
-    m_heartbeatCounter(readHeartbeat()) {
+    m_heartbeatCounter(readGCData().heartbeat) {
     m_agent.fetchNoLock();
     m_timeout = m_agent.getTimeout();
   }
   
   bool checkAlive() {
-    uint64_t newHeartBeatCount;
-    try { 
-      newHeartBeatCount = readHeartbeat();
+    struct gcData newGCData;
+    try {
+      newGCData = readGCData();
     } catch (Backend::NoSuchObject &) {
       // The agent could be gone. This is not an error. Mark it as alive,
       // and will be trimmed later.
       return true;
     }        
     auto timer = m_timer.secs();
-    if (newHeartBeatCount == m_heartbeatCounter && timer > m_timeout)
+    // If GC is required, it's easy...
+    if (newGCData.needsGC) return false;
+    // If heartbeat has not moved for more than the timeout, we declare the agent dead.
+    if (newGCData.heartbeat == m_heartbeatCounter && timer > m_timeout)
       return false;
-    if (newHeartBeatCount != m_heartbeatCounter) {
-      m_heartbeatCounter = newHeartBeatCount;
+    if (newGCData.heartbeat != m_heartbeatCounter) {
+      m_heartbeatCounter = newGCData.heartbeat;
       m_timer.reset();
     }
     return true; 
@@ -51,7 +54,9 @@ public:
   
   std::list<log::Param> getDeadAgentDetails() {
     std::list<log::Param> ret;
-    ret.push_back(log::Param("currentHeartbeat", readHeartbeat()));
+    auto gcData = readGCData();
+    ret.push_back(log::Param("currentHeartbeat", gcData.heartbeat));
+    ret.push_back(log::Param("GCRequested", gcData.needsGC?"true":"false"));
     ret.push_back(log::Param("timeout", m_timeout));
     ret.push_back(log::Param("timer", m_timer.secs()));
     ret.push_back(log::Param("heartbeatAtTimerStart", m_heartbeatCounter));
@@ -72,9 +77,16 @@ private:
   uint64_t m_heartbeatCounter;
   double m_timeout;
   
-  uint64_t readHeartbeat() {
+  struct gcData {
+    uint64_t heartbeat;
+    bool needsGC;
+  };
+  struct gcData readGCData() {
     m_agent.fetchNoLock();
-    return m_agent.getHeartbeatCount();
+    struct gcData ret;
+    ret.heartbeat = m_agent.getHeartbeatCount();
+    ret.needsGC = m_agent.needsGarbageCollection();
+    return ret;
   }
 };
     
diff --git a/objectstore/BackendPopulator.cpp b/objectstore/BackendPopulator.cpp
index 3a7f71a1af..7c6d84ceb5 100644
--- a/objectstore/BackendPopulator.cpp
+++ b/objectstore/BackendPopulator.cpp
@@ -66,6 +66,8 @@ BackendPopulator::~BackendPopulator() throw() {
     agent.fetch();
     if (m_leaveNonEmptyAgentBehind && !agent.isEmpty()) {
       cta::log::ScopedParamContainer params(m_lc);
+      agent.setNeedsGarbageCollection();
+      agent.commit();
       params.add("agentObject", agent.getAddressIfSet())
             .add("ownedObjectCount", agent.getOwnershipList().size());
       m_lc.log(log::WARNING, "In BackendPopulator::~BackendPopulator(): not deleting non-empty agent object, left for garbage collection.");
diff --git a/objectstore/cta.proto b/objectstore/cta.proto
index 0ef553d934..c3797e3a5c 100644
--- a/objectstore/cta.proto
+++ b/objectstore/cta.proto
@@ -157,6 +157,7 @@ message Agent {
   required uint64 timeout_us = 2002;
   repeated string ownedobjects = 2003;
   optional bool being_garbage_collected = 2004 [default = false];
+  optional bool gc_needed = 2005 [default = false];
 }
 
 message AgentRegister {
-- 
GitLab