From 6e5503abf1344a3a7010ef3030bfd27b8793b0dc Mon Sep 17 00:00:00 2001 From: Eric Cano <Eric.Cano@cern.ch> Date: Fri, 2 Aug 2019 09:25:36 +0200 Subject: [PATCH] Added flag for immediate garbage collection of agents. The process can now indicate the agent structure can be garbage collected, saving a wait for a timeout. This is typically hapenning when archive jobs need requeueing after a session hits the end of the tape. --- objectstore/Agent.cpp | 12 ++++++++++++ objectstore/Agent.hpp | 4 ++++ objectstore/AgentWatchdog.hpp | 32 ++++++++++++++++++++++---------- objectstore/BackendPopulator.cpp | 2 ++ objectstore/cta.proto | 1 + 5 files changed, 41 insertions(+), 10 deletions(-) diff --git a/objectstore/Agent.cpp b/objectstore/Agent.cpp index 89cbe61546..4e6c3e91ef 100644 --- a/objectstore/Agent.cpp +++ b/objectstore/Agent.cpp @@ -45,6 +45,7 @@ void cta::objectstore::Agent::initialize() { m_payload.set_timeout_us(120*1000*1000); m_payload.set_description(""); m_payload.set_being_garbage_collected(false); + m_payload.set_gc_needed(false); m_payloadInterpreted = true; } @@ -149,6 +150,17 @@ void cta::objectstore::Agent::setBeingGarbageCollected() { m_payload.set_being_garbage_collected(true); } +void cta::objectstore::Agent::setNeedsGarbageCollection() { + checkPayloadWritable(); + m_payload.set_gc_needed(true); +} + +bool cta::objectstore::Agent::needsGarbageCollection() { + checkPayloadReadable(); + if (!m_payload.has_gc_needed()) return false; + return m_payload.gc_needed(); +} + void cta::objectstore::Agent::garbageCollect(const std::string& presumedOwner, AgentReference & agentReference, log::LogContext & lc, cta::catalogue::Catalogue & catalogue) { checkPayloadWritable(); diff --git a/objectstore/Agent.hpp b/objectstore/Agent.hpp index da25916118..de371d1134 100644 --- a/objectstore/Agent.hpp +++ b/objectstore/Agent.hpp @@ -62,6 +62,10 @@ public: bool isBeingGarbageCollected(); + void setNeedsGarbageCollection(); + + bool needsGarbageCollection(); + void garbageCollect(const std::string &presumedOwner, AgentReference & agentReference, log::LogContext & lc, cta::catalogue::Catalogue & catalogue) override; diff --git a/objectstore/AgentWatchdog.hpp b/objectstore/AgentWatchdog.hpp index 9f1a157a19..0ae9af672b 100644 --- a/objectstore/AgentWatchdog.hpp +++ b/objectstore/AgentWatchdog.hpp @@ -25,25 +25,28 @@ namespace cta { namespace objectstore { class AgentWatchdog { public: AgentWatchdog(const std::string & name, Backend & os): m_agent(name, os), - m_heartbeatCounter(readHeartbeat()) { + m_heartbeatCounter(readGCData().heartbeat) { m_agent.fetchNoLock(); m_timeout = m_agent.getTimeout(); } bool checkAlive() { - uint64_t newHeartBeatCount; - try { - newHeartBeatCount = readHeartbeat(); + struct gcData newGCData; + try { + newGCData = readGCData(); } catch (Backend::NoSuchObject &) { // The agent could be gone. This is not an error. Mark it as alive, // and will be trimmed later. return true; } auto timer = m_timer.secs(); - if (newHeartBeatCount == m_heartbeatCounter && timer > m_timeout) + // If GC is required, it's easy... + if (newGCData.needsGC) return false; + // If heartbeat has not moved for more than the timeout, we declare the agent dead. + if (newGCData.heartbeat == m_heartbeatCounter && timer > m_timeout) return false; - if (newHeartBeatCount != m_heartbeatCounter) { - m_heartbeatCounter = newHeartBeatCount; + if (newGCData.heartbeat != m_heartbeatCounter) { + m_heartbeatCounter = newGCData.heartbeat; m_timer.reset(); } return true; @@ -51,7 +54,9 @@ public: std::list<log::Param> getDeadAgentDetails() { std::list<log::Param> ret; - ret.push_back(log::Param("currentHeartbeat", readHeartbeat())); + auto gcData = readGCData(); + ret.push_back(log::Param("currentHeartbeat", gcData.heartbeat)); + ret.push_back(log::Param("GCRequested", gcData.needsGC?"true":"false")); ret.push_back(log::Param("timeout", m_timeout)); ret.push_back(log::Param("timer", m_timer.secs())); ret.push_back(log::Param("heartbeatAtTimerStart", m_heartbeatCounter)); @@ -72,9 +77,16 @@ private: uint64_t m_heartbeatCounter; double m_timeout; - uint64_t readHeartbeat() { + struct gcData { + uint64_t heartbeat; + bool needsGC; + }; + struct gcData readGCData() { m_agent.fetchNoLock(); - return m_agent.getHeartbeatCount(); + struct gcData ret; + ret.heartbeat = m_agent.getHeartbeatCount(); + ret.needsGC = m_agent.needsGarbageCollection(); + return ret; } }; diff --git a/objectstore/BackendPopulator.cpp b/objectstore/BackendPopulator.cpp index 3a7f71a1af..7c6d84ceb5 100644 --- a/objectstore/BackendPopulator.cpp +++ b/objectstore/BackendPopulator.cpp @@ -66,6 +66,8 @@ BackendPopulator::~BackendPopulator() throw() { agent.fetch(); if (m_leaveNonEmptyAgentBehind && !agent.isEmpty()) { cta::log::ScopedParamContainer params(m_lc); + agent.setNeedsGarbageCollection(); + agent.commit(); params.add("agentObject", agent.getAddressIfSet()) .add("ownedObjectCount", agent.getOwnershipList().size()); m_lc.log(log::WARNING, "In BackendPopulator::~BackendPopulator(): not deleting non-empty agent object, left for garbage collection."); diff --git a/objectstore/cta.proto b/objectstore/cta.proto index 0ef553d934..c3797e3a5c 100644 --- a/objectstore/cta.proto +++ b/objectstore/cta.proto @@ -157,6 +157,7 @@ message Agent { required uint64 timeout_us = 2002; repeated string ownedobjects = 2003; optional bool being_garbage_collected = 2004 [default = false]; + optional bool gc_needed = 2005 [default = false]; } message AgentRegister { -- GitLab