From 05bb51c07e936b99016db804ec503c499c769c47 Mon Sep 17 00:00:00 2001 From: Joao Afonso <joao.afonso@cern.ch> Date: Thu, 26 Jan 2023 17:38:11 +0100 Subject: [PATCH] Minor fixes to tape server global lock timeout --- ReleaseNotes.md | 1 - .../ctafrontend/cc7/opt/run/bin/taped.sh | 1 + scheduler/OStoreDB/OStoreDB.cpp | 27 ++++++++++++++++--- .../tapeserver/daemon/DataTransferSession.cpp | 12 +++++++-- tapeserver/daemon/cta-taped.conf.example | 3 +++ 5 files changed, 37 insertions(+), 7 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 27c978ab78..8493420d14 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -10,7 +10,6 @@ - cta/CTA#230 - Modify CTA code to enforce VID uppercase - cta/CTA#239 - Add improvments to the cta-change-storage-class tool - cta/CTA#248 - Clean up output from cta-readtp -- cta/CTA#251 - Increase free drive STALE threshold to 4 hours - cta/CTA#218 - Do not retry during repack requests - cta/CTA#252 - Update cta-change-storage-class to accept json file as input - cta/CTA#136 - Add missing forward declarations to standalone cli tools diff --git a/continuousintegration/docker/ctafrontend/cc7/opt/run/bin/taped.sh b/continuousintegration/docker/ctafrontend/cc7/opt/run/bin/taped.sh index f0fa25d83e..07e8fd80f1 100755 --- a/continuousintegration/docker/ctafrontend/cc7/opt/run/bin/taped.sh +++ b/continuousintegration/docker/ctafrontend/cc7/opt/run/bin/taped.sh @@ -50,6 +50,7 @@ echo ${DATABASEURL} > /etc/cta/cta-catalogue.conf echo "taped BufferSizeBytes 262144" > /etc/cta/cta-taped.conf echo "taped BufferCount 200" >> /etc/cta/cta-taped.conf echo "taped MountCriteria 2000000, 100" >> /etc/cta/cta-taped.conf + echo "taped WatchdogIdleSessionTimer 2" >> /etc/cta/cta-taped.conf # Make tape servers more responsive, thus improving CI test speed echo "ObjectStore BackendPath $OBJECTSTOREURL" >> /etc/cta/cta-taped.conf echo "taped UseEncryption no" >> /etc/cta/cta-taped.conf echo "${tpconfig}" > /etc/cta/TPCONFIG diff --git a/scheduler/OStoreDB/OStoreDB.cpp b/scheduler/OStoreDB/OStoreDB.cpp index 5bc560f3db..a541efdcef 100644 --- a/scheduler/OStoreDB/OStoreDB.cpp +++ b/scheduler/OStoreDB/OStoreDB.cpp @@ -717,11 +717,31 @@ std::unique_ptr<SchedulerDatabase::TapeMountDecisionInfo> // Take an exclusive lock on the scheduling and fetch it. tmdi.m_schedulerGlobalLock.reset( new SchedulerGlobalLock(re.getSchedulerGlobalLock(), m_objectStore)); - tmdi.m_lockOnSchedulerGlobalLock.lock(*tmdi.m_schedulerGlobalLock, globalLockTimeout_us); + try { + tmdi.m_lockOnSchedulerGlobalLock.lock(*tmdi.m_schedulerGlobalLock, globalLockTimeout_us); + } catch (cta::exception::TimeoutException &e) { + auto lockSchedGlobalTime = t.secs(utils::Timer::resetCounter); + log::ScopedParamContainer params(logContext); + params.add("rootFetchNoLockTime", rootFetchNoLockTime) + .add("lockSchedGlobalTime", lockSchedGlobalTime) + .add("fetchSchedGlobalTime", 0.0) + .add("status", "FAILURE"); + logContext.log(log::INFO, "In OStoreDB::getMountInfo(): global lock acquisition."); + throw; + } auto lockSchedGlobalTime = t.secs(utils::Timer::resetCounter); tmdi.m_lockTaken = true; tmdi.m_schedulerGlobalLock->fetch(); - auto fetchSchedGlobalTime = t.secs(utils::Timer::resetCounter);; + auto fetchSchedGlobalTime = t.secs(utils::Timer::resetCounter); + { + + log::ScopedParamContainer params(logContext); + params.add("rootFetchNoLockTime", rootFetchNoLockTime) + .add("lockSchedGlobalTime", lockSchedGlobalTime) + .add("fetchSchedGlobalTime", fetchSchedGlobalTime) + .add("status", "SUCCESS"); + logContext.log(log::INFO, "In OStoreDB::getMountInfo(): global lock acquisition."); + } fetchMountInfo(tmdi, re, SchedulerDatabase::PurposeGetMountInfo::GET_NEXT_MOUNT, logContext); auto fetchMountInfoTime = t.secs(utils::Timer::resetCounter); std::unique_ptr<SchedulerDatabase::TapeMountDecisionInfo> ret(std::move(privateRet)); @@ -731,8 +751,7 @@ std::unique_ptr<SchedulerDatabase::TapeMountDecisionInfo> .add("lockSchedGlobalTime", lockSchedGlobalTime) .add("fetchSchedGlobalTime", fetchSchedGlobalTime) .add("fetchMountInfoTime", fetchMountInfoTime); - if ((rootFetchNoLockTime > 1) || (lockSchedGlobalTime > 1) || (fetchSchedGlobalTime > 1) || fetchMountInfoTime > 1) - logContext.log(log::INFO, "In OStoreDB::getMountInfo(): success."); + logContext.log(log::INFO, "In OStoreDB::getMountInfo(): success."); } return ret; } diff --git a/tapeserver/castor/tape/tapeserver/daemon/DataTransferSession.cpp b/tapeserver/castor/tape/tapeserver/daemon/DataTransferSession.cpp index 888873bcf3..f0765033b1 100644 --- a/tapeserver/castor/tape/tapeserver/daemon/DataTransferSession.cpp +++ b/tapeserver/castor/tape/tapeserver/daemon/DataTransferSession.cpp @@ -107,6 +107,8 @@ castor::tape::tapeserver::daemon::DataTransferSession::execute() { TapeSessionReporter tapeServerReporter(m_initialProcess, m_driveConfig, m_hostname, lc); std::unique_ptr<cta::TapeMount> tapeMount; + cta::utils::Timer t; + bool globalLockTimeout = false; // 2a) Determine if we want to mount at all (for now) // This variable will allow us to see if we switched from down to up and start an @@ -167,7 +169,11 @@ castor::tape::tapeserver::daemon::DataTransferSession::execute() { tapeServerReporter.reportState(cta::tape::session::SessionState::Scheduling, cta::tape::session::SessionType::Undetermined); - bool globalLockTimeout = false; + if (!globalLockTimeout) { + t.reset(); + } + + globalLockTimeout = false; try { if (m_scheduler.getNextMountDryRun(m_driveConfig.logicalLibrary, m_driveConfig.unitName, lc)) { tapeMount = m_scheduler.getNextMount(m_driveConfig.logicalLibrary, m_driveConfig.unitName, lc, @@ -175,7 +181,9 @@ castor::tape::tapeserver::daemon::DataTransferSession::execute() { } } catch (cta::exception::TimeoutException &e) { // Print warning and try again, after refreshing the tape drive states - lc.log(cta::log::WARNING, "Timeout while scheduling new mount."); + lc.log(cta::log::WARNING, + "Global lock timeout while getting new mount (" + std::to_string(m_dataTransferConfig.wdGlobalLockAcqMaxSecs) + " seconds reached). " + "Total time trying to get new mount is " + std::to_string(t.secs()) + " seconds."); globalLockTimeout = true; } catch (cta::exception::Exception &e) { lc.log(cta::log::ERR, "Error while scheduling new mount. Putting the drive down. Stack trace follows."); diff --git a/tapeserver/daemon/cta-taped.conf.example b/tapeserver/daemon/cta-taped.conf.example index 17ef2c9ecd..34f9afe86e 100644 --- a/tapeserver/daemon/cta-taped.conf.example +++ b/tapeserver/daemon/cta-taped.conf.example @@ -180,3 +180,6 @@ taped externalFreeDiskSpaceScript /usr/local/bin/cta-get-free-disk-space.sh # # Maximum time to wait after scheduling came up idle, in seconds. # taped WatchdogIdleSessionTimer 10 + +# Time to wait after which the tape server stops trying to get the next mount, in seconds. +# taped WatchdogGlobalLockAcqMaxSecs 900 -- GitLab