From 96e42200f05f575bb55b0a632ef6cabc12b4f319 Mon Sep 17 00:00:00 2001
From: Eric Cano <Eric.Cano@cern.ch>
Date: Thu, 26 Oct 2017 17:26:47 +0200
Subject: [PATCH] Switched garbage collector from dedicated watcher to all
 watch all strategy.

The previous strategy was made under the assumption that we needed to lock sparingly.
With the introduction of lockfree strategy, this is not true anymore.
The new strategy will be immune from the A watches B, B watches A, both die, and
no one garbage collects them situation (also called cyclers in utility cta-objectstore-unfollow-agent).
---
 objectstore/AgentWatchdog.hpp                 |   6 +-
 objectstore/GarbageCollector.cpp              | 230 ++++--------------
 objectstore/GarbageCollector.hpp              |   1 -
 tapeserver/daemon/GarbageCollectorHandler.cpp |   1 -
 4 files changed, 52 insertions(+), 186 deletions(-)

diff --git a/objectstore/AgentWatchdog.hpp b/objectstore/AgentWatchdog.hpp
index 875dd295bf..64d3f2f3be 100644
--- a/objectstore/AgentWatchdog.hpp
+++ b/objectstore/AgentWatchdog.hpp
@@ -26,8 +26,7 @@ class AgentWatchdog {
 public:
   AgentWatchdog(const std::string & name, Backend & os): m_agent(name, os), 
     m_heartbeatCounter(readHeartbeat()) {
-    ScopedSharedLock lock(m_agent);
-    m_agent.fetch();
+    m_agent.fetchNoLock();
     m_timeout = m_agent.getTimeout();
   }
   
@@ -58,8 +57,7 @@ private:
   double m_timeout;
   
   uint64_t readHeartbeat() {
-    ScopedSharedLock lock(m_agent);
-    m_agent.fetch();
+    m_agent.fetchNoLock();
     return m_agent.getHeartbeatCount();
   }
 };
diff --git a/objectstore/GarbageCollector.cpp b/objectstore/GarbageCollector.cpp
index 1f63aa25a1..355c5b782e 100644
--- a/objectstore/GarbageCollector.cpp
+++ b/objectstore/GarbageCollector.cpp
@@ -22,7 +22,6 @@
 #include <algorithm>
 
 namespace cta { namespace objectstore {
-const size_t GarbageCollector::c_maxWatchedAgentsPerGC = 25;
 
 GarbageCollector::GarbageCollector(Backend & os, AgentReference & agentReference, catalogue::Catalogue & catalogue): 
   m_objectStore(os), m_catalogue(catalogue), m_ourAgentReference(agentReference), m_agentRegister(os) {
@@ -42,105 +41,61 @@ void GarbageCollector::runOnePass(log::LogContext & lc) {
 }
   
 void GarbageCollector::trimGoneTargets(log::LogContext & lc) {
-  ScopedSharedLock arLock(m_agentRegister);
-  m_agentRegister.fetch();
-  arLock.release();
+  m_agentRegister.fetchNoLock();
   std::list<std::string> agentList = m_agentRegister.getAgents();
+  // Find the agents we knew about and are not listed anymore.
+  // We will just stop looking for them.
   for (std::map<std::string, AgentWatchdog * >::iterator wa
         = m_watchedAgents.begin();
       wa != m_watchedAgents.end();) {
     if (agentList.end() == std::find(agentList.begin(), agentList.end(), wa->first)) {
-      Agent ourAgent(m_ourAgentReference.getAgentAddress(), m_objectStore);
-      ScopedExclusiveLock oaLock(ourAgent);
-      ourAgent.fetch();
-      ourAgent.removeFromOwnership(wa->first);
-      std::string removedAgent = wa->first;
-      ourAgent.commit();
-      oaLock.release();
       delete wa->second;
       m_watchedAgents.erase(wa++);
       log::ScopedParamContainer params(lc);
-      params.add("agentAddress", removedAgent);
+      params.add("agentAddress", wa->first);
       lc.log(log::INFO, "In GarbageCollector::trimGoneTargets(): removed now gone agent.");
     } else {
       wa++;
     }
   }
 }
-  
+
 void GarbageCollector::aquireTargets(log::LogContext & lc) {
-  ScopedExclusiveLock arLock(m_agentRegister);
-  m_agentRegister.fetch();
-  // Get the list of untracked agents
+  m_agentRegister.fetchNoLock();
+  // We will now watch all agents we do not know about yet.
   std::list<std::string> candidatesList = m_agentRegister.getUntrackedAgents();
-  std::list<std::string>::const_iterator c = candidatesList.begin();
-  // We can now take ownership of new agents, up to our max...
-  // and we don't monitor ourselves!
-  for (;m_watchedAgents.size() < c_maxWatchedAgentsPerGC
-         && c!=candidatesList.end(); c++) {
+  // Build a set of our own tracked agents.
+  std::set<std::string> alreadyTrackedAgents;
+  for (auto &ata: m_watchedAgents) {
+    alreadyTrackedAgents.insert(ata.first);
+  }
+  for (auto &c: candidatesList) {
     // We don't monitor ourselves
-    if (*c != m_ourAgentReference.getAgentAddress()) {
+    if (c != m_ourAgentReference.getAgentAddress() && !alreadyTrackedAgents.count(c)) {
       // So we have a candidate we might want to monitor
       // First, check that the agent entry exists, and that ownership
       // is indeed pointing to the agent register
-      Agent ag(*c, m_objectStore);
-      ScopedExclusiveLock agLock;
-      Agent ourAgent(m_ourAgentReference.getAgentAddress(), m_objectStore);
-      ScopedExclusiveLock oaLock;
+      Agent ag(c, m_objectStore);
       try {
-        if (!ag.exists()) {
-          // This is a dangling pointer to a dead object:
-          // remove it in the agentRegister.
-          m_agentRegister.removeAgent(*c);
-          continue;
-        }
-        agLock.lock(ag);
-        ag.fetch();
-        // Check that the actual owner is the agent register.
-        // otherwise, it should not be listed as an agent to monitor
-        if (ag.getOwner() != m_agentRegister.getAddressIfSet()) {
-          m_agentRegister.trackAgent(ag.getAddressIfSet());
-          agLock.release();
-          continue;
-        }
-        // We are now interested in tracking this agent. So we will transfer its
-        // ownership. We alredy have an exclusive lock on the agent.
-        // Lock ours
-        
-        oaLock.lock(ourAgent);
-        ourAgent.fetch();
-        ourAgent.addToOwnership(ag.getAddressIfSet());
-        ourAgent.commit();
-        // We now have a pointer to the agent, we can make the ownership official
-        ag.setOwner(ourAgent.getAddressIfSet());
-        ag.commit();
-      } catch (cta::exception::Exception & ex) {
-        // We received an exception. This can happen is the agent disappears under our feet.
-        // This is fine, we just let go this time, and trimGoneTargets() will just de-reference
-        // it later. But if the object is present, we have a problem.
-        if (m_objectStore.exists(*c)) throw;
+        ag.fetchNoLock();
+      } catch (...) {
+        // The agent could simply be gone... (If not, let the complain go through).
+        if (m_objectStore.exists(c)) throw;
+        continue;
+      }
+      if (ag.getOwner() == m_agentRegister.getAddressIfSet()) {
       }
       log::ScopedParamContainer params(lc);
       params.add("agentAddress", ag.getAddressIfSet())
-            .add("gcAgentAddress", ourAgent.getAddressIfSet());
+            .add("gcAgentAddress", m_ourAgentReference.getAgentAddress());
       lc.log(log::INFO, "In GarbageCollector::aquireTargets(): started tracking an untracked agent");
-      // Agent is officially ours, we can remove it from the untracked agent's
-      // list
-      m_agentRegister.trackAgent(ag.getAddressIfSet());
-      m_agentRegister.commit();
-      // Agent is now officially ours, let's track it. We have the release the 
-      // lock to the agent before constructing the watchdog, which builds
-      // its own agent objects (and need to lock the object store representation)
-      std::string agentName = ag.getAddressIfSet();
+      // Agent is to be tracked, let's track it.
       double timeout=ag.getTimeout();
-      agLock.release();      
-      m_watchedAgents[agentName] =
-        new AgentWatchdog(agentName, m_objectStore);
-      m_watchedAgents[ag.getAddressIfSet()]->setTimeout(timeout);
+      m_watchedAgents[c] =
+        new AgentWatchdog(c, m_objectStore);
+      m_watchedAgents[c]->setTimeout(timeout);
     }
   }
-  // Commit all the modifications to the agent register
-  m_agentRegister.commit();
 }
  
 void GarbageCollector::checkHeartbeats(log::LogContext & lc) {
@@ -152,11 +107,6 @@ void GarbageCollector::checkHeartbeats(log::LogContext & lc) {
     try {
       if (!wa->second->checkAlive()) {
         cleanupDeadAgent(wa->first, lc);
-        Agent ourAgent(m_ourAgentReference.getAgentAddress(), m_objectStore);
-        ScopedExclusiveLock oaLock(ourAgent);
-        ourAgent.fetch();
-        ourAgent.removeFromOwnership(wa->first);
-        ourAgent.commit();
         delete wa->second;
         m_watchedAgents.erase(wa++);
       } else {
@@ -175,19 +125,33 @@ void GarbageCollector::checkHeartbeats(log::LogContext & lc) {
 }
 
 void GarbageCollector::cleanupDeadAgent(const std::string & address, log::LogContext & lc) {
-  // Check that we are still owners of the agent (sanity check).
+  // We detected a dead agent. Try and take ownership of it. It could already be owned
+  // by another garbage collector.
+  // To minimize locking, take a lock on the agent and check its ownership first.
+  // We do not need to be defensive about exception here as calling function will
+  // deal with them.
   Agent agent(address, m_objectStore);
   ScopedExclusiveLock agLock(agent);
   agent.fetch();
   log::ScopedParamContainer params(lc);
   params.add("agentAddress", agent.getAddressIfSet())
-       .add("gcAgentAddress", m_ourAgentReference.getAgentAddress());
-  if (agent.getOwner() != m_ourAgentReference.getAgentAddress()) {
-   log::ScopedParamContainer params(lc);
-   lc.log(log::WARNING, "In GarbageCollector::cleanupDeadAgent(): skipping agent which is not owned by this garbage collector as thought.");
-   // The agent is removed from our ownership by the calling function: we're done.
-   return;
+        .add("gcAgentAddress", m_ourAgentReference.getAgentAddress());
+  if (agent.getOwner() != m_agentRegister.getAddressIfSet()) {
+    params.add("agentOwner", agent.getOwner());
+    lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): skipping agent which is not owned by agent register anymore.");
+    // The agent will be removed from our ownership by the calling function: we're done.
+    return;
   }
+  // Aquire ownership of the agent.
+  m_ourAgentReference.addToOwnership(address,m_objectStore);
+  agent.setOwner(m_ourAgentReference.getAgentAddress());
+  agent.commit();
+  // Update the register
+  ScopedExclusiveLock arl(m_agentRegister);
+  m_agentRegister.fetch();
+  m_agentRegister.trackAgent(address);
+  m_agentRegister.commit();
+  arl.release();
   lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): will cleanup dead agent.");
   // Return all objects owned by the agent to their respective backup owners
   auto ownedObjects = agent.getOwnershipList();
@@ -214,102 +178,8 @@ void GarbageCollector::cleanupDeadAgent(const std::string & address, log::LogCon
   // We now processed all the owned objects. We can delete the agent's entry
   agent.removeAndUnregisterSelf();
   lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): agent entry removed.");
+  // We can remove the agent from our own ownership.
+  m_ourAgentReference.removeFromOwnership(address, m_objectStore);
 }
 
-void GarbageCollector::reinjectOwnedObject(log::LogContext& lc) {
-  // We have to release the agents we were following. PErformance is not an issue, so
-  // we go in small steps.
-  // First check the agents are indeed owned by us and still exist.
-  std::list<std::string> stillTrackedAgents;
-  std::list<std::string> goneAgents;
-  std::list<std::string> notReallyOwnedAgents;
-  std::list<std::string> inaccessibleAgents;
-  {
-    auto a = m_watchedAgents.begin();
-    while(a!=m_watchedAgents.end()) {
-      auto & agentAddress=a->first;
-      log::ScopedParamContainer params(lc);
-      params.add("agentAddress", agentAddress);
-      // Check the agent is there, and ours.
-      if (!m_objectStore.exists(agentAddress)) {
-        goneAgents.emplace_back(agentAddress);
-        lc.log(log::INFO, "In GarbageCollector::reinjectOwnedObject(): agent not present anymore.");
-      } else {
-        try {
-          Agent ag(agentAddress, m_objectStore);
-          ScopedSharedLock agl(ag);
-          ag.fetch();
-          if (ag.getOwner() == m_ourAgentReference.getAgentAddress()) {
-            stillTrackedAgents.emplace_back(agentAddress);
-            lc.log(log::INFO, "In GarbageCollector::reinjectOwnedObject(): agent still owned by us.");
-          } else {
-            params.add("currentOwner", ag.getOwner());
-            notReallyOwnedAgents.emplace_back(agentAddress);
-            lc.log(log::ERR, "In GarbageCollector::reinjectOwnedObject(): agent not owned by us.");
-          }
-        } catch (cta::exception::Exception & ex) {
-          params.add("ExceptionMessage", ex.getMessageValue());
-          lc.log(log::ERR, "In GarbageCollector::reinjectOwnedObject(): agent inaccessible.");
-          inaccessibleAgents.emplace_back(a->first);
-        }
-      }
-      a=m_watchedAgents.erase(a);
-    }
-  }
-  {
-    // We now have an overview of the situation. We can update the agent register based on that.
-    ScopedExclusiveLock arLock(m_agentRegister);
-    m_agentRegister.fetch();
-    for (auto &sta: stillTrackedAgents) {
-      m_agentRegister.untrackAgent(sta);
-      log::ScopedParamContainer params(lc);
-      params.add("agentAddress", sta);
-      lc.log(log::INFO, "In GarbageCollector::reinjectOwnedObject(): untracked agent in registry.");
-    }
-    for (auto &ga: goneAgents) {
-      m_agentRegister.removeAgent(ga);
-      log::ScopedParamContainer params(lc);
-      params.add("agentAddress", ga);
-      lc.log(log::INFO, "In GarbageCollector::reinjectOwnedObject(): removed gone agent from registry.");
-    }
-    // This is all we are going to do. Other agents cannot be acted upon.
-    m_agentRegister.commit();
-  }
-  // We can now remove ownership from the agents we still owned
-  for (auto & sta: stillTrackedAgents) {
-    log::ScopedParamContainer params(lc);
-    params.add("agentAddress", sta);
-    Agent ag (sta, m_objectStore);
-    ScopedExclusiveLock agl(ag);
-    ag.fetch();
-    if (ag.getOwner() == m_ourAgentReference.getAgentAddress()) {
-      ag.setOwner(m_agentRegister.getAddressIfSet());
-      ag.commit();
-      lc.log(log::INFO, "In GarbageCollector::reinjectOwnedObject(): changed agent ownership to registry.");
-    } else {
-      params.add("newOwner", ag.getOwner());
-      lc.log(log::ERR, "In GarbageCollector::reinjectOwnedObject(): skipping agent whose ownership we lost last minute.");
-    }
-  }
-  // We can now cleanup our own agent and remove it.
-  Agent ourAg(m_ourAgentReference.getAgentAddress(), m_objectStore);
-  ScopedExclusiveLock ourAgL(ourAg);
-  ourAg.fetch();
-  std::list<std::string> allAgents;
-  allAgents.splice(allAgents.end(), stillTrackedAgents);
-  allAgents.splice(allAgents.end(), notReallyOwnedAgents);
-  allAgents.splice(allAgents.end(), inaccessibleAgents);
-  allAgents.splice(allAgents.end(), goneAgents);
-  for (auto & a: allAgents) {
-    log::ScopedParamContainer params(lc);
-    params.add("agentAddress", a);
-    ourAg.removeFromOwnership(a);
-    lc.log(log::INFO, "In GarbageCollector::reinjectOwnedObject(): removed agent from our ownership.");
-  }
-  ourAg.commit();
-}
-
-
-
-
 }}
diff --git a/objectstore/GarbageCollector.hpp b/objectstore/GarbageCollector.hpp
index beea0ec4a7..37706e4a71 100644
--- a/objectstore/GarbageCollector.hpp
+++ b/objectstore/GarbageCollector.hpp
@@ -55,7 +55,6 @@ private:
   AgentReference & m_ourAgentReference;
   AgentRegister m_agentRegister;
   std::map<std::string, AgentWatchdog * > m_watchedAgents;
-  static const size_t c_maxWatchedAgentsPerGC;
 };
   
 }}
\ No newline at end of file
diff --git a/tapeserver/daemon/GarbageCollectorHandler.cpp b/tapeserver/daemon/GarbageCollectorHandler.cpp
index 51910179d9..024387697c 100644
--- a/tapeserver/daemon/GarbageCollectorHandler.cpp
+++ b/tapeserver/daemon/GarbageCollectorHandler.cpp
@@ -313,7 +313,6 @@ int GarbageCollectorHandler::runChild() {
         receivedMessage=true;
       } catch (server::SocketPair::Timeout & ex) {}
     } while (!receivedMessage);
-    gc.reinjectOwnedObject(m_processManager.logContext());
     m_processManager.logContext().log(log::INFO,
         "In GarbageCollectorHandler::runChild(): Received shutdown message. Exiting.");
   } catch (cta::exception::Exception & ex) {
-- 
GitLab