From 05bb51c07e936b99016db804ec503c499c769c47 Mon Sep 17 00:00:00 2001
From: Joao Afonso <joao.afonso@cern.ch>
Date: Thu, 26 Jan 2023 17:38:11 +0100
Subject: [PATCH] Minor fixes to tape server global lock timeout

---
 ReleaseNotes.md                               |  1 -
 .../ctafrontend/cc7/opt/run/bin/taped.sh      |  1 +
 scheduler/OStoreDB/OStoreDB.cpp               | 27 ++++++++++++++++---
 .../tapeserver/daemon/DataTransferSession.cpp | 12 +++++++--
 tapeserver/daemon/cta-taped.conf.example      |  3 +++
 5 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/ReleaseNotes.md b/ReleaseNotes.md
index 27c978ab78..8493420d14 100644
--- a/ReleaseNotes.md
+++ b/ReleaseNotes.md
@@ -10,7 +10,6 @@
 - cta/CTA#230 - Modify CTA code to enforce VID uppercase
 - cta/CTA#239 - Add improvments to the cta-change-storage-class tool
 - cta/CTA#248 - Clean up output from cta-readtp
-- cta/CTA#251 - Increase free drive STALE threshold to 4 hours
 - cta/CTA#218 - Do not retry during repack requests
 - cta/CTA#252 - Update cta-change-storage-class to accept json file as input
 - cta/CTA#136 - Add missing forward declarations to standalone cli tools
diff --git a/continuousintegration/docker/ctafrontend/cc7/opt/run/bin/taped.sh b/continuousintegration/docker/ctafrontend/cc7/opt/run/bin/taped.sh
index f0fa25d83e..07e8fd80f1 100755
--- a/continuousintegration/docker/ctafrontend/cc7/opt/run/bin/taped.sh
+++ b/continuousintegration/docker/ctafrontend/cc7/opt/run/bin/taped.sh
@@ -50,6 +50,7 @@ echo ${DATABASEURL} > /etc/cta/cta-catalogue.conf
   echo "taped BufferSizeBytes 262144" > /etc/cta/cta-taped.conf
   echo "taped BufferCount 200" >> /etc/cta/cta-taped.conf
   echo "taped MountCriteria 2000000, 100" >> /etc/cta/cta-taped.conf
+  echo "taped WatchdogIdleSessionTimer 2" >> /etc/cta/cta-taped.conf # Make tape servers more responsive, thus improving CI test speed
   echo "ObjectStore BackendPath $OBJECTSTOREURL" >> /etc/cta/cta-taped.conf
   echo "taped UseEncryption no" >> /etc/cta/cta-taped.conf
   echo "${tpconfig}" > /etc/cta/TPCONFIG
diff --git a/scheduler/OStoreDB/OStoreDB.cpp b/scheduler/OStoreDB/OStoreDB.cpp
index 5bc560f3db..a541efdcef 100644
--- a/scheduler/OStoreDB/OStoreDB.cpp
+++ b/scheduler/OStoreDB/OStoreDB.cpp
@@ -717,11 +717,31 @@ std::unique_ptr<SchedulerDatabase::TapeMountDecisionInfo>
   // Take an exclusive lock on the scheduling and fetch it.
   tmdi.m_schedulerGlobalLock.reset(
     new SchedulerGlobalLock(re.getSchedulerGlobalLock(), m_objectStore));
-  tmdi.m_lockOnSchedulerGlobalLock.lock(*tmdi.m_schedulerGlobalLock, globalLockTimeout_us);
+  try {
+    tmdi.m_lockOnSchedulerGlobalLock.lock(*tmdi.m_schedulerGlobalLock, globalLockTimeout_us);
+  } catch (cta::exception::TimeoutException &e) {
+    auto lockSchedGlobalTime = t.secs(utils::Timer::resetCounter);
+    log::ScopedParamContainer params(logContext);
+    params.add("rootFetchNoLockTime", rootFetchNoLockTime)
+          .add("lockSchedGlobalTime", lockSchedGlobalTime)
+          .add("fetchSchedGlobalTime", 0.0)
+          .add("status", "FAILURE");
+    logContext.log(log::INFO, "In OStoreDB::getMountInfo(): global lock acquisition.");
+    throw;
+  }
   auto lockSchedGlobalTime = t.secs(utils::Timer::resetCounter);
   tmdi.m_lockTaken = true;
   tmdi.m_schedulerGlobalLock->fetch();
-  auto fetchSchedGlobalTime = t.secs(utils::Timer::resetCounter);;
+  auto fetchSchedGlobalTime = t.secs(utils::Timer::resetCounter);
+  {
+
+    log::ScopedParamContainer params(logContext);
+    params.add("rootFetchNoLockTime", rootFetchNoLockTime)
+          .add("lockSchedGlobalTime", lockSchedGlobalTime)
+          .add("fetchSchedGlobalTime", fetchSchedGlobalTime)
+          .add("status", "SUCCESS");
+    logContext.log(log::INFO, "In OStoreDB::getMountInfo(): global lock acquisition.");
+  }
   fetchMountInfo(tmdi, re, SchedulerDatabase::PurposeGetMountInfo::GET_NEXT_MOUNT, logContext);
   auto fetchMountInfoTime = t.secs(utils::Timer::resetCounter);
   std::unique_ptr<SchedulerDatabase::TapeMountDecisionInfo> ret(std::move(privateRet));
@@ -731,8 +751,7 @@ std::unique_ptr<SchedulerDatabase::TapeMountDecisionInfo>
           .add("lockSchedGlobalTime", lockSchedGlobalTime)
           .add("fetchSchedGlobalTime", fetchSchedGlobalTime)
           .add("fetchMountInfoTime", fetchMountInfoTime);
-    if ((rootFetchNoLockTime > 1) || (lockSchedGlobalTime > 1) || (fetchSchedGlobalTime > 1) || fetchMountInfoTime > 1)
-      logContext.log(log::INFO, "In OStoreDB::getMountInfo(): success.");
+    logContext.log(log::INFO, "In OStoreDB::getMountInfo(): success.");
   }
   return ret;
 }
diff --git a/tapeserver/castor/tape/tapeserver/daemon/DataTransferSession.cpp b/tapeserver/castor/tape/tapeserver/daemon/DataTransferSession.cpp
index 888873bcf3..f0765033b1 100644
--- a/tapeserver/castor/tape/tapeserver/daemon/DataTransferSession.cpp
+++ b/tapeserver/castor/tape/tapeserver/daemon/DataTransferSession.cpp
@@ -107,6 +107,8 @@ castor::tape::tapeserver::daemon::DataTransferSession::execute() {
   TapeSessionReporter tapeServerReporter(m_initialProcess, m_driveConfig, m_hostname, lc);
 
   std::unique_ptr<cta::TapeMount> tapeMount;
+  cta::utils::Timer t;
+  bool globalLockTimeout = false;
 
   // 2a) Determine if we want to mount at all (for now)
   // This variable will allow us to see if we switched from down to up and start an
@@ -167,7 +169,11 @@ castor::tape::tapeserver::daemon::DataTransferSession::execute() {
     tapeServerReporter.reportState(cta::tape::session::SessionState::Scheduling,
                                    cta::tape::session::SessionType::Undetermined);
 
-    bool globalLockTimeout = false;
+    if (!globalLockTimeout) {
+      t.reset();
+    }
+
+    globalLockTimeout = false;
     try {
       if (m_scheduler.getNextMountDryRun(m_driveConfig.logicalLibrary, m_driveConfig.unitName, lc)) {
         tapeMount = m_scheduler.getNextMount(m_driveConfig.logicalLibrary, m_driveConfig.unitName, lc,
@@ -175,7 +181,9 @@ castor::tape::tapeserver::daemon::DataTransferSession::execute() {
       }
     } catch (cta::exception::TimeoutException &e) {
       // Print warning and try again, after refreshing the tape drive states
-      lc.log(cta::log::WARNING, "Timeout while scheduling new mount.");
+      lc.log(cta::log::WARNING,
+               "Global lock timeout while getting new mount (" + std::to_string(m_dataTransferConfig.wdGlobalLockAcqMaxSecs) + " seconds reached). "
+               "Total time trying to get new mount is " + std::to_string(t.secs()) + " seconds.");
       globalLockTimeout = true;
     } catch (cta::exception::Exception &e) {
       lc.log(cta::log::ERR, "Error while scheduling new mount. Putting the drive down. Stack trace follows.");
diff --git a/tapeserver/daemon/cta-taped.conf.example b/tapeserver/daemon/cta-taped.conf.example
index 17ef2c9ecd..34f9afe86e 100644
--- a/tapeserver/daemon/cta-taped.conf.example
+++ b/tapeserver/daemon/cta-taped.conf.example
@@ -180,3 +180,6 @@ taped externalFreeDiskSpaceScript /usr/local/bin/cta-get-free-disk-space.sh
 # 
 # Maximum time to wait after scheduling came up idle, in seconds.
 # taped WatchdogIdleSessionTimer 10
+
+# Time to wait after which the tape server stops trying to get the next mount, in seconds.
+# taped WatchdogGlobalLockAcqMaxSecs 900
-- 
GitLab