diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 1f1022cb3be0c8dfbe1246b83466facc22fc3c9c..00964c6efbd81e77352ec2b9bf188f0321e16678 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,6 +11,7 @@ - cta/CTA#309 - Ignore 'NoSuchObject' exceptions thrown by non-existing objects during retrieve job requeuing - cta/CTA#310 - Trigger cleanup session if taped child process did not exit with success code - cta/CTA#320 - CTA No Oracle is failing +- cta/CTA#322 - Queues with cleanup heartbeat above zero are not being picked for cleanup # v4.8.5-1 diff --git a/objectstore/QueueCleanupRunner.cpp b/objectstore/QueueCleanupRunner.cpp index 426a38f18372637faf267f01829d671fd146402f..f5536acabbbfc24856d49a23445f87119cd11789 100644 --- a/objectstore/QueueCleanupRunner.cpp +++ b/objectstore/QueueCleanupRunner.cpp @@ -43,24 +43,15 @@ void QueueCleanupRunner::runOnePass(log::LogContext &logContext) { continue; // Ignore queue } - // Check heartbeat of other queues being cleaned up + // Check heartbeat of queues to know if they are being cleaned up if (queue.assignedAgent.has_value()) { - bool newEntry = false; - - // We must register all new queues that are being cleaned up - if (m_heartbeatCheck.find(queue.vid) == m_heartbeatCheck.end()) { - newEntry = true; + if ((m_heartbeatCheck.count(queue.vid) == 0) || (m_heartbeatCheck[queue.vid].heartbeat != queue.heartbeat)) { + // If this queue was never seen before, wait for another turn to check if its heartbeat has timed out. + // If heartbeat has been updated, then the queue is being actively processed by another agent. + // Record new timestamp and move on. m_heartbeatCheck[queue.vid].agent = queue.assignedAgent.value(); m_heartbeatCheck[queue.vid].heartbeat = queue.heartbeat; m_heartbeatCheck[queue.vid].lastUpdateTimestamp = m_timer.secs(); - } - - auto oldHeartbeatValue = m_heartbeatCheck[queue.vid].heartbeat; - - if (newEntry || queue.heartbeat != oldHeartbeatValue) { - // If heartbeat has been updated, then the queue is being actively processed by another agent - // Record new timestamp and move on - m_heartbeatCheck[queue.vid].lastUpdateTimestamp = m_timer.secs(); continue; // Ignore queue } else { // If heartbeat has not been updated, check how long ago the last update happened diff --git a/scheduler/OStoreDB/OStoreDB.cpp b/scheduler/OStoreDB/OStoreDB.cpp index 5b3ed8bb469f02122c5be2492b337d9092fe6780..210feb8b6580d13d672c647a2d49b003b7e76183 100644 --- a/scheduler/OStoreDB/OStoreDB.cpp +++ b/scheduler/OStoreDB/OStoreDB.cpp @@ -2026,7 +2026,7 @@ void OStoreDB::reserveRetrieveQueueForCleanup(const std::string & vid, std::opti // Check if heartbeat has been updated, which means that another agent is still tracking it if (rq.getQueueCleanupAssignedAgent().has_value()) { - if (rq.getQueueCleanupHeartbeat() != cleanupHeartBeatValue.has_value() ? cleanupHeartBeatValue.value() : 0) { + if (rq.getQueueCleanupHeartbeat() != (cleanupHeartBeatValue.has_value() ? cleanupHeartBeatValue.value() : 0)) { throw RetrieveQueueNotReservedForCleanup("Another agent is alive and cleaning up the queue. Skipping it."); } }