diff --git a/objectstore/AgentWatchdog.hpp b/objectstore/AgentWatchdog.hpp index 875dd295bf24eba35ea7e122951fde5e5b49568b..64d3f2f3be940e549669de0d81cc3a5fbfa4a4f9 100644 --- a/objectstore/AgentWatchdog.hpp +++ b/objectstore/AgentWatchdog.hpp @@ -26,8 +26,7 @@ class AgentWatchdog { public: AgentWatchdog(const std::string & name, Backend & os): m_agent(name, os), m_heartbeatCounter(readHeartbeat()) { - ScopedSharedLock lock(m_agent); - m_agent.fetch(); + m_agent.fetchNoLock(); m_timeout = m_agent.getTimeout(); } @@ -58,8 +57,7 @@ private: double m_timeout; uint64_t readHeartbeat() { - ScopedSharedLock lock(m_agent); - m_agent.fetch(); + m_agent.fetchNoLock(); return m_agent.getHeartbeatCount(); } }; diff --git a/objectstore/GarbageCollector.cpp b/objectstore/GarbageCollector.cpp index 1f63aa25a1f093b1e1838221b5f63413bcd3c3af..355c5b782e26c07ca171aeb68bf16ede6dfe8245 100644 --- a/objectstore/GarbageCollector.cpp +++ b/objectstore/GarbageCollector.cpp @@ -22,7 +22,6 @@ #include <algorithm> namespace cta { namespace objectstore { -const size_t GarbageCollector::c_maxWatchedAgentsPerGC = 25; GarbageCollector::GarbageCollector(Backend & os, AgentReference & agentReference, catalogue::Catalogue & catalogue): m_objectStore(os), m_catalogue(catalogue), m_ourAgentReference(agentReference), m_agentRegister(os) { @@ -42,105 +41,61 @@ void GarbageCollector::runOnePass(log::LogContext & lc) { } void GarbageCollector::trimGoneTargets(log::LogContext & lc) { - ScopedSharedLock arLock(m_agentRegister); - m_agentRegister.fetch(); - arLock.release(); + m_agentRegister.fetchNoLock(); std::list<std::string> agentList = m_agentRegister.getAgents(); + // Find the agents we knew about and are not listed anymore. + // We will just stop looking for them. for (std::map<std::string, AgentWatchdog * >::iterator wa = m_watchedAgents.begin(); wa != m_watchedAgents.end();) { if (agentList.end() == std::find(agentList.begin(), agentList.end(), wa->first)) { - Agent ourAgent(m_ourAgentReference.getAgentAddress(), m_objectStore); - ScopedExclusiveLock oaLock(ourAgent); - ourAgent.fetch(); - ourAgent.removeFromOwnership(wa->first); - std::string removedAgent = wa->first; - ourAgent.commit(); - oaLock.release(); delete wa->second; m_watchedAgents.erase(wa++); log::ScopedParamContainer params(lc); - params.add("agentAddress", removedAgent); + params.add("agentAddress", wa->first); lc.log(log::INFO, "In GarbageCollector::trimGoneTargets(): removed now gone agent."); } else { wa++; } } } - + void GarbageCollector::aquireTargets(log::LogContext & lc) { - ScopedExclusiveLock arLock(m_agentRegister); - m_agentRegister.fetch(); - // Get the list of untracked agents + m_agentRegister.fetchNoLock(); + // We will now watch all agents we do not know about yet. std::list<std::string> candidatesList = m_agentRegister.getUntrackedAgents(); - std::list<std::string>::const_iterator c = candidatesList.begin(); - // We can now take ownership of new agents, up to our max... - // and we don't monitor ourselves! - for (;m_watchedAgents.size() < c_maxWatchedAgentsPerGC - && c!=candidatesList.end(); c++) { + // Build a set of our own tracked agents. + std::set<std::string> alreadyTrackedAgents; + for (auto &ata: m_watchedAgents) { + alreadyTrackedAgents.insert(ata.first); + } + for (auto &c: candidatesList) { // We don't monitor ourselves - if (*c != m_ourAgentReference.getAgentAddress()) { + if (c != m_ourAgentReference.getAgentAddress() && !alreadyTrackedAgents.count(c)) { // So we have a candidate we might want to monitor // First, check that the agent entry exists, and that ownership // is indeed pointing to the agent register - Agent ag(*c, m_objectStore); - ScopedExclusiveLock agLock; - Agent ourAgent(m_ourAgentReference.getAgentAddress(), m_objectStore); - ScopedExclusiveLock oaLock; + Agent ag(c, m_objectStore); try { - if (!ag.exists()) { - // This is a dangling pointer to a dead object: - // remove it in the agentRegister. - m_agentRegister.removeAgent(*c); - continue; - } - agLock.lock(ag); - ag.fetch(); - // Check that the actual owner is the agent register. - // otherwise, it should not be listed as an agent to monitor - if (ag.getOwner() != m_agentRegister.getAddressIfSet()) { - m_agentRegister.trackAgent(ag.getAddressIfSet()); - agLock.release(); - continue; - } - // We are now interested in tracking this agent. So we will transfer its - // ownership. We alredy have an exclusive lock on the agent. - // Lock ours - - oaLock.lock(ourAgent); - ourAgent.fetch(); - ourAgent.addToOwnership(ag.getAddressIfSet()); - ourAgent.commit(); - // We now have a pointer to the agent, we can make the ownership official - ag.setOwner(ourAgent.getAddressIfSet()); - ag.commit(); - } catch (cta::exception::Exception & ex) { - // We received an exception. This can happen is the agent disappears under our feet. - // This is fine, we just let go this time, and trimGoneTargets() will just de-reference - // it later. But if the object is present, we have a problem. - if (m_objectStore.exists(*c)) throw; + ag.fetchNoLock(); + } catch (...) { + // The agent could simply be gone... (If not, let the complain go through). + if (m_objectStore.exists(c)) throw; + continue; + } + if (ag.getOwner() == m_agentRegister.getAddressIfSet()) { } log::ScopedParamContainer params(lc); params.add("agentAddress", ag.getAddressIfSet()) - .add("gcAgentAddress", ourAgent.getAddressIfSet()); + .add("gcAgentAddress", m_ourAgentReference.getAgentAddress()); lc.log(log::INFO, "In GarbageCollector::aquireTargets(): started tracking an untracked agent"); - // Agent is officially ours, we can remove it from the untracked agent's - // list - m_agentRegister.trackAgent(ag.getAddressIfSet()); - m_agentRegister.commit(); - // Agent is now officially ours, let's track it. We have the release the - // lock to the agent before constructing the watchdog, which builds - // its own agent objects (and need to lock the object store representation) - std::string agentName = ag.getAddressIfSet(); + // Agent is to be tracked, let's track it. double timeout=ag.getTimeout(); - agLock.release(); - m_watchedAgents[agentName] = - new AgentWatchdog(agentName, m_objectStore); - m_watchedAgents[ag.getAddressIfSet()]->setTimeout(timeout); + m_watchedAgents[c] = + new AgentWatchdog(c, m_objectStore); + m_watchedAgents[c]->setTimeout(timeout); } } - // Commit all the modifications to the agent register - m_agentRegister.commit(); } void GarbageCollector::checkHeartbeats(log::LogContext & lc) { @@ -152,11 +107,6 @@ void GarbageCollector::checkHeartbeats(log::LogContext & lc) { try { if (!wa->second->checkAlive()) { cleanupDeadAgent(wa->first, lc); - Agent ourAgent(m_ourAgentReference.getAgentAddress(), m_objectStore); - ScopedExclusiveLock oaLock(ourAgent); - ourAgent.fetch(); - ourAgent.removeFromOwnership(wa->first); - ourAgent.commit(); delete wa->second; m_watchedAgents.erase(wa++); } else { @@ -175,19 +125,33 @@ void GarbageCollector::checkHeartbeats(log::LogContext & lc) { } void GarbageCollector::cleanupDeadAgent(const std::string & address, log::LogContext & lc) { - // Check that we are still owners of the agent (sanity check). + // We detected a dead agent. Try and take ownership of it. It could already be owned + // by another garbage collector. + // To minimize locking, take a lock on the agent and check its ownership first. + // We do not need to be defensive about exception here as calling function will + // deal with them. Agent agent(address, m_objectStore); ScopedExclusiveLock agLock(agent); agent.fetch(); log::ScopedParamContainer params(lc); params.add("agentAddress", agent.getAddressIfSet()) - .add("gcAgentAddress", m_ourAgentReference.getAgentAddress()); - if (agent.getOwner() != m_ourAgentReference.getAgentAddress()) { - log::ScopedParamContainer params(lc); - lc.log(log::WARNING, "In GarbageCollector::cleanupDeadAgent(): skipping agent which is not owned by this garbage collector as thought."); - // The agent is removed from our ownership by the calling function: we're done. - return; + .add("gcAgentAddress", m_ourAgentReference.getAgentAddress()); + if (agent.getOwner() != m_agentRegister.getAddressIfSet()) { + params.add("agentOwner", agent.getOwner()); + lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): skipping agent which is not owned by agent register anymore."); + // The agent will be removed from our ownership by the calling function: we're done. + return; } + // Aquire ownership of the agent. + m_ourAgentReference.addToOwnership(address,m_objectStore); + agent.setOwner(m_ourAgentReference.getAgentAddress()); + agent.commit(); + // Update the register + ScopedExclusiveLock arl(m_agentRegister); + m_agentRegister.fetch(); + m_agentRegister.trackAgent(address); + m_agentRegister.commit(); + arl.release(); lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): will cleanup dead agent."); // Return all objects owned by the agent to their respective backup owners auto ownedObjects = agent.getOwnershipList(); @@ -214,102 +178,8 @@ void GarbageCollector::cleanupDeadAgent(const std::string & address, log::LogCon // We now processed all the owned objects. We can delete the agent's entry agent.removeAndUnregisterSelf(); lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): agent entry removed."); + // We can remove the agent from our own ownership. + m_ourAgentReference.removeFromOwnership(address, m_objectStore); } -void GarbageCollector::reinjectOwnedObject(log::LogContext& lc) { - // We have to release the agents we were following. PErformance is not an issue, so - // we go in small steps. - // First check the agents are indeed owned by us and still exist. - std::list<std::string> stillTrackedAgents; - std::list<std::string> goneAgents; - std::list<std::string> notReallyOwnedAgents; - std::list<std::string> inaccessibleAgents; - { - auto a = m_watchedAgents.begin(); - while(a!=m_watchedAgents.end()) { - auto & agentAddress=a->first; - log::ScopedParamContainer params(lc); - params.add("agentAddress", agentAddress); - // Check the agent is there, and ours. - if (!m_objectStore.exists(agentAddress)) { - goneAgents.emplace_back(agentAddress); - lc.log(log::INFO, "In GarbageCollector::reinjectOwnedObject(): agent not present anymore."); - } else { - try { - Agent ag(agentAddress, m_objectStore); - ScopedSharedLock agl(ag); - ag.fetch(); - if (ag.getOwner() == m_ourAgentReference.getAgentAddress()) { - stillTrackedAgents.emplace_back(agentAddress); - lc.log(log::INFO, "In GarbageCollector::reinjectOwnedObject(): agent still owned by us."); - } else { - params.add("currentOwner", ag.getOwner()); - notReallyOwnedAgents.emplace_back(agentAddress); - lc.log(log::ERR, "In GarbageCollector::reinjectOwnedObject(): agent not owned by us."); - } - } catch (cta::exception::Exception & ex) { - params.add("ExceptionMessage", ex.getMessageValue()); - lc.log(log::ERR, "In GarbageCollector::reinjectOwnedObject(): agent inaccessible."); - inaccessibleAgents.emplace_back(a->first); - } - } - a=m_watchedAgents.erase(a); - } - } - { - // We now have an overview of the situation. We can update the agent register based on that. - ScopedExclusiveLock arLock(m_agentRegister); - m_agentRegister.fetch(); - for (auto &sta: stillTrackedAgents) { - m_agentRegister.untrackAgent(sta); - log::ScopedParamContainer params(lc); - params.add("agentAddress", sta); - lc.log(log::INFO, "In GarbageCollector::reinjectOwnedObject(): untracked agent in registry."); - } - for (auto &ga: goneAgents) { - m_agentRegister.removeAgent(ga); - log::ScopedParamContainer params(lc); - params.add("agentAddress", ga); - lc.log(log::INFO, "In GarbageCollector::reinjectOwnedObject(): removed gone agent from registry."); - } - // This is all we are going to do. Other agents cannot be acted upon. - m_agentRegister.commit(); - } - // We can now remove ownership from the agents we still owned - for (auto & sta: stillTrackedAgents) { - log::ScopedParamContainer params(lc); - params.add("agentAddress", sta); - Agent ag (sta, m_objectStore); - ScopedExclusiveLock agl(ag); - ag.fetch(); - if (ag.getOwner() == m_ourAgentReference.getAgentAddress()) { - ag.setOwner(m_agentRegister.getAddressIfSet()); - ag.commit(); - lc.log(log::INFO, "In GarbageCollector::reinjectOwnedObject(): changed agent ownership to registry."); - } else { - params.add("newOwner", ag.getOwner()); - lc.log(log::ERR, "In GarbageCollector::reinjectOwnedObject(): skipping agent whose ownership we lost last minute."); - } - } - // We can now cleanup our own agent and remove it. - Agent ourAg(m_ourAgentReference.getAgentAddress(), m_objectStore); - ScopedExclusiveLock ourAgL(ourAg); - ourAg.fetch(); - std::list<std::string> allAgents; - allAgents.splice(allAgents.end(), stillTrackedAgents); - allAgents.splice(allAgents.end(), notReallyOwnedAgents); - allAgents.splice(allAgents.end(), inaccessibleAgents); - allAgents.splice(allAgents.end(), goneAgents); - for (auto & a: allAgents) { - log::ScopedParamContainer params(lc); - params.add("agentAddress", a); - ourAg.removeFromOwnership(a); - lc.log(log::INFO, "In GarbageCollector::reinjectOwnedObject(): removed agent from our ownership."); - } - ourAg.commit(); -} - - - - }} diff --git a/objectstore/GarbageCollector.hpp b/objectstore/GarbageCollector.hpp index beea0ec4a7216f33665d32707b97a34f35207c07..37706e4a711ccb4b89fd7ef81deb48ee6436d889 100644 --- a/objectstore/GarbageCollector.hpp +++ b/objectstore/GarbageCollector.hpp @@ -55,7 +55,6 @@ private: AgentReference & m_ourAgentReference; AgentRegister m_agentRegister; std::map<std::string, AgentWatchdog * > m_watchedAgents; - static const size_t c_maxWatchedAgentsPerGC; }; }} \ No newline at end of file diff --git a/tapeserver/daemon/GarbageCollectorHandler.cpp b/tapeserver/daemon/GarbageCollectorHandler.cpp index 51910179d9a6c8bfbe4ca5dca29a3b87db029017..024387697c9f74513c280d2aa3c62763c9959606 100644 --- a/tapeserver/daemon/GarbageCollectorHandler.cpp +++ b/tapeserver/daemon/GarbageCollectorHandler.cpp @@ -313,7 +313,6 @@ int GarbageCollectorHandler::runChild() { receivedMessage=true; } catch (server::SocketPair::Timeout & ex) {} } while (!receivedMessage); - gc.reinjectOwnedObject(m_processManager.logContext()); m_processManager.logContext().log(log::INFO, "In GarbageCollectorHandler::runChild(): Received shutdown message. Exiting."); } catch (cta::exception::Exception & ex) {