From c940cde3e15415b91286510a13f0b28dc0bdea9f Mon Sep 17 00:00:00 2001
From: Eric Cano <Eric.Cano@cern.ch>
Date: Mon, 25 Sep 2017 10:39:02 +0200
Subject: [PATCH] Added extra checks and logs to diagnose orphaned archive
 requests.

This would detect instances where rados reports an object deletion before is actually happens.
---
 objectstore/BackendRados.cpp    |  5 ++++-
 objectstore/BackendTest.cpp     | 14 ++++++++++++++
 scheduler/OStoreDB/OStoreDB.cpp | 14 ++++++++++++--
 3 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/objectstore/BackendRados.cpp b/objectstore/BackendRados.cpp
index ce531b8064..b0c735e943 100644
--- a/objectstore/BackendRados.cpp
+++ b/objectstore/BackendRados.cpp
@@ -425,9 +425,12 @@ void BackendRados::AsyncUpdater::fetchCallback(librados::completion_t completion
             if(updateWithDelete) {
               try {
                 au.m_backend.remove(au.m_name);
+                if (au.m_backend.exists(au.m_name)) {
+                  throw exception::Exception("Object exists after remove");
+                }
               } catch (cta::exception::Exception &ex) {
                 throw CouldNotUpdateValue(
-                    std::string("In In BackendRados::AsyncUpdater::fetchCallback::update_lambda(): failed to remove value: ") +
+                    std::string("In BackendRados::AsyncUpdater::fetchCallback::update_lambda(): failed to remove value: ") +
                     au.m_name + ex.what());
               }
               // Done!
diff --git a/objectstore/BackendTest.cpp b/objectstore/BackendTest.cpp
index eb8087cc58..495a75ec46 100644
--- a/objectstore/BackendTest.cpp
+++ b/objectstore/BackendTest.cpp
@@ -160,6 +160,20 @@ TEST_P(BackendAbstractTest, AsyncIOInterface) {
   m_os->remove(testObjectName);
 }
 
+TEST_P(BackendAbstractTest, AsyncIOInterfaceRemove) {
+  // Create object to delete in async update.
+  const std::string testValue = "1234";
+  const std::string testObjectName = "testObject";
+  try {m_os->remove(testObjectName);} catch(...){}
+  m_os->create(testObjectName, testValue);
+  // Launch update of object via asynchronous IO
+  std::function<std::string(const std::string &)> updaterCallback=
+      [&](const std::string &s)->std::string{throw cta::objectstore::Backend::AsyncUpdateWithDelete();};
+  std::unique_ptr<cta::objectstore::Backend::AsyncUpdater> updater(m_os->asyncUpdate(testObjectName,updaterCallback));
+  ASSERT_NO_THROW(updater->wait());
+  ASSERT_FALSE(m_os->exists(testObjectName));
+}
+
 TEST_P(BackendAbstractTest, AsyncIOInterfaceMultithread) {
   // Create object to update.
   const std::string testValue = "1234";
diff --git a/scheduler/OStoreDB/OStoreDB.cpp b/scheduler/OStoreDB/OStoreDB.cpp
index 07ce35116c..b8a9a234a6 100644
--- a/scheduler/OStoreDB/OStoreDB.cpp
+++ b/scheduler/OStoreDB/OStoreDB.cpp
@@ -2635,7 +2635,11 @@ void OStoreDB::ArchiveJob::bumpUpTapeFileCount(uint64_t newFileCount) {
 //------------------------------------------------------------------------------
 // OStoreDB::ArchiveJob::asyncSucceed()
 //------------------------------------------------------------------------------
-void OStoreDB::ArchiveJob::asyncSucceed() {  
+void OStoreDB::ArchiveJob::asyncSucceed() {
+  log::LogContext lc(m_logger);
+  log::ScopedParamContainer params(lc);
+  params.add("requestObject", m_archiveRequest.getAddressIfSet());
+  lc.log(log::DEBUG, "Will start async update archiveRequest for success");
   m_jobUpdate.reset(m_archiveRequest.asyncUpdateJobSuccessful(tapeFile.copyNb));
 }
 
@@ -2644,6 +2648,10 @@ void OStoreDB::ArchiveJob::asyncSucceed() {
 //------------------------------------------------------------------------------
 bool OStoreDB::ArchiveJob::checkSucceed() {  
   m_jobUpdate->wait();
+  log::LogContext lc(m_logger);
+  log::ScopedParamContainer params(lc);
+  params.add("requestObject", m_archiveRequest.getAddressIfSet());
+  lc.log(log::DEBUG, "Async update of archiveRequest for success complete");
   if (m_jobUpdate->m_isLastJob) {
     m_archiveRequest.resetValues();
   }
@@ -2651,7 +2659,9 @@ bool OStoreDB::ArchiveJob::checkSucceed() {
   m_jobOwned = false;
   // Remove ownership from agent
   const std::string atfrAddress = m_archiveRequest.getAddressIfSet();
-  m_agentReference.removeFromOwnership(atfrAddress, m_objectStore);  
+  m_agentReference.removeFromOwnership(atfrAddress, m_objectStore);
+  params.add("agentObject", m_agentReference.getAgentAddress());
+  lc.log(log::DEBUG, "Removed job from ownership");
   return m_jobUpdate->m_isLastJob;
 }
 
-- 
GitLab