diff --git a/catalogue/Catalogue.hpp b/catalogue/Catalogue.hpp index 9f4b3e2608048aaeed0a007840f6a8763375a909..b33ddcfb544cad611d1db303a5641ca60ef79875 100644 --- a/catalogue/Catalogue.hpp +++ b/catalogue/Catalogue.hpp @@ -58,6 +58,7 @@ #include "common/dataStructures/VidToTapeMap.hpp" #include "common/dataStructures/WriteTestResult.hpp" #include "common/exception/UserError.hpp" +#include "common/log/LogContext.hpp" #include "common/log/Logger.hpp" #include <list> @@ -159,13 +160,15 @@ public: * @param user The user for whom the file is to be retrieved. This will be * used by the Catalogue to determine the mount policy to be used when * retrieving the file. + * @param lc The log context. * * @return The information required to queue the associated retrieve request(s). */ virtual common::dataStructures::RetrieveFileQueueCriteria prepareToRetrieveFile( const std::string &instanceName, const uint64_t archiveFileId, - const common::dataStructures::UserIdentity &user) = 0; + const common::dataStructures::UserIdentity &user, + log::LogContext &lc) = 0; /** * Notifies the CTA catalogue that the specified tape has been mounted in @@ -498,10 +501,12 @@ public: * @param instanceName The name of the instance from where the deletion request * originated * @param archiveFileId The unique identifier of the archive file. + * @param lc The log context. * @return The metadata of the deleted archive file including the metadata of * the associated and also deleted tape copies. 
*/ - virtual void deleteArchiveFile(const std::string &instanceName, const uint64_t archiveFileId) = 0; + virtual void deleteArchiveFile(const std::string &instanceName, const uint64_t archiveFileId, + log::LogContext &lc) = 0; /** * Returns true if the specified user running the CTA command-line tool on diff --git a/catalogue/CatalogueTest.cpp b/catalogue/CatalogueTest.cpp index e239c84b26dabd913315e82e9f078153e3d8f40c..81c2e203281b3a73f5a0114ca39acf99a199bfcc 100644 --- a/catalogue/CatalogueTest.cpp +++ b/catalogue/CatalogueTest.cpp @@ -96,7 +96,8 @@ void cta_catalogue_CatalogueTest::SetUp() { auto itor = m_catalogue->getArchiveFiles(); while(itor.hasMore()) { const auto archiveFile = itor.next(); - m_catalogue->deleteArchiveFile(archiveFile.diskInstance, archiveFile.archiveFileID); + log::LogContext dummyLc(m_dummyLog); + m_catalogue->deleteArchiveFile(archiveFile.diskInstance, archiveFile.archiveFileID, dummyLc); } } { @@ -4881,11 +4882,13 @@ TEST_P(cta_catalogue_CatalogueTest, prepareToRetrieveFile) { ASSERT_EQ(m_admin.host, rule.creationLog.host); ASSERT_EQ(rule.creationLog, rule.lastModificationLog); + log::LogContext dummyLc(m_dummyLog); + common::dataStructures::UserIdentity userIdentity; userIdentity.name = requesterName; userIdentity.group = "group"; const common::dataStructures::RetrieveFileQueueCriteria queueCriteria = - m_catalogue->prepareToRetrieveFile(diskInstanceName1, archiveFileId, userIdentity); + m_catalogue->prepareToRetrieveFile(diskInstanceName1, archiveFileId, userIdentity, dummyLc); ASSERT_EQ(2, queueCriteria.archiveFile.tapeFiles.size()); ASSERT_EQ(archivePriority, queueCriteria.mountPolicy.archivePriority); @@ -4893,7 +4896,7 @@ TEST_P(cta_catalogue_CatalogueTest, prepareToRetrieveFile) { ASSERT_EQ(maxDrivesAllowed, queueCriteria.mountPolicy.maxDrivesAllowed); // Check that the diskInstanceName mismatch detection works - ASSERT_THROW(m_catalogue->prepareToRetrieveFile(diskInstanceName2, archiveFileId, userIdentity), + 
ASSERT_THROW(m_catalogue->prepareToRetrieveFile(diskInstanceName2, archiveFileId, userIdentity, dummyLc), exception::UserError); } @@ -6187,7 +6190,8 @@ TEST_P(cta_catalogue_CatalogueTest, deleteArchiveFile) { ASSERT_EQ(file2Written.copyNb, tapeFile2.copyNb); } - m_catalogue->deleteArchiveFile("disk_instance", archiveFileId); + log::LogContext dummyLc(m_dummyLog); + m_catalogue->deleteArchiveFile("disk_instance", archiveFileId, dummyLc); ASSERT_FALSE(m_catalogue->getArchiveFiles().hasMore()); } @@ -6497,14 +6501,16 @@ TEST_P(cta_catalogue_CatalogueTest, deleteArchiveFile_of_another_disk_instance) ASSERT_EQ(file2Written.copyNb, tapeFile2.copyNb); } - ASSERT_THROW(m_catalogue->deleteArchiveFile("another_disk_instance", archiveFileId), exception::UserError); + log::LogContext dummyLc(m_dummyLog); + ASSERT_THROW(m_catalogue->deleteArchiveFile("another_disk_instance", archiveFileId, dummyLc), exception::UserError); } TEST_P(cta_catalogue_CatalogueTest, deleteArchiveFile_non_existant) { using namespace cta; ASSERT_FALSE(m_catalogue->getArchiveFiles().hasMore()); - m_catalogue->deleteArchiveFile("disk_instance", 12345678); + log::LogContext dummyLc(m_dummyLog); + m_catalogue->deleteArchiveFile("disk_instance", 12345678, dummyLc); } TEST_P(cta_catalogue_CatalogueTest, getTapesByVid_non_existant_tape) { @@ -6794,7 +6800,10 @@ TEST_P(cta_catalogue_CatalogueTest, reclaimTape_full_lastFSeq_1_no_tape_files) { ASSERT_EQ(creationLog, lastModificationLog); } - m_catalogue->deleteArchiveFile(diskInstanceName1, file1Written.archiveFileId); + { + log::LogContext dummyLc(m_dummyLog); + m_catalogue->deleteArchiveFile(diskInstanceName1, file1Written.archiveFileId, dummyLc); + } { const std::list<common::dataStructures::Tape> tapes = m_catalogue->getTapes(); diff --git a/catalogue/DummyCatalogue.hpp b/catalogue/DummyCatalogue.hpp index b3eb001562c73bfe50ed2487755280ad1d43f933..2d7b4cf4c906c983155496f5e363cc22e02ce98e 100644 --- a/catalogue/DummyCatalogue.hpp +++ 
b/catalogue/DummyCatalogue.hpp @@ -46,7 +46,7 @@ public: void createTapePool(const common::dataStructures::SecurityIdentity& admin, const std::string& name, const uint64_t nbPartialTapes, const bool encryptionValue, const std::string& comment) { throw exception::Exception(std::string("In ")+__PRETTY_FUNCTION__+": not implemented"); } void deleteAdminHost(const std::string& hostName) { throw exception::Exception(std::string("In ")+__PRETTY_FUNCTION__+": not implemented"); } void deleteAdminUser(const std::string& username) { throw exception::Exception(std::string("In ")+__PRETTY_FUNCTION__+": not implemented"); } - void deleteArchiveFile(const std::string& instanceName, const uint64_t archiveFileId) override { throw exception::Exception(std::string("In ")+__PRETTY_FUNCTION__+": not implemented"); } + void deleteArchiveFile(const std::string& instanceName, const uint64_t archiveFileId, log::LogContext &lc) override { throw exception::Exception(std::string("In ")+__PRETTY_FUNCTION__+": not implemented"); } void deleteArchiveRoute(const std::string& diskInstanceName, const std::string& storageClassName, const uint64_t copyNb) { throw exception::Exception(std::string("In ")+__PRETTY_FUNCTION__+": not implemented"); } void deleteLogicalLibrary(const std::string& name) { throw exception::Exception(std::string("In ")+__PRETTY_FUNCTION__+": not implemented"); } void deleteMountPolicy(const std::string& name) { throw exception::Exception(std::string("In ")+__PRETTY_FUNCTION__+": not implemented"); } @@ -99,7 +99,7 @@ public: void noSpaceLeftOnTape(const std::string& vid) { throw exception::Exception(std::string("In ")+__PRETTY_FUNCTION__+": not implemented"); } void ping() { throw exception::Exception(std::string("In ")+__PRETTY_FUNCTION__+": not implemented"); } common::dataStructures::ArchiveFileQueueCriteria prepareForNewFile(const std::string& diskInstanceName, const std::string& storageClassName, const common::dataStructures::UserIdentity& user) { throw 
exception::Exception(std::string("In ")+__PRETTY_FUNCTION__+": not implemented"); } - common::dataStructures::RetrieveFileQueueCriteria prepareToRetrieveFile(const std::string& instanceName, const uint64_t archiveFileId, const common::dataStructures::UserIdentity& user) { throw exception::Exception(std::string("In ")+__PRETTY_FUNCTION__+": not implemented"); } + common::dataStructures::RetrieveFileQueueCriteria prepareToRetrieveFile(const std::string& instanceName, const uint64_t archiveFileId, const common::dataStructures::UserIdentity& user, log::LogContext &lc) { throw exception::Exception(std::string("In ")+__PRETTY_FUNCTION__+": not implemented"); } void reclaimTape(const common::dataStructures::SecurityIdentity& admin, const std::string& vid) { throw exception::Exception(std::string("In ")+__PRETTY_FUNCTION__+": not implemented"); } void setTapeDisabled(const common::dataStructures::SecurityIdentity& admin, const std::string& vid, const bool disabledValue) { throw exception::Exception(std::string("In ")+__PRETTY_FUNCTION__+": not implemented"); } void setTapeFull(const common::dataStructures::SecurityIdentity& admin, const std::string& vid, const bool fullValue) { throw exception::Exception(std::string("In ")+__PRETTY_FUNCTION__+": not implemented"); } @@ -139,4 +139,4 @@ private: std::map<std::string, bool> m_tapeEnabling; }; -}} // namespace cta::catalogue. \ No newline at end of file +}} // namespace cta::catalogue. 
diff --git a/catalogue/OracleCatalogue.cpp b/catalogue/OracleCatalogue.cpp index d014d4994f230ab91cef499a4a656d1d03b80dd3..d6b8a055e3bd00a19380d2d0adbb3098fcf70140 100644 --- a/catalogue/OracleCatalogue.cpp +++ b/catalogue/OracleCatalogue.cpp @@ -22,6 +22,7 @@ #include "common/exception/Exception.hpp" #include "common/make_unique.hpp" #include "common/threading/MutexLocker.hpp" +#include "common/Timer.hpp" #include "common/utils/utils.hpp" #include "rdbms/AutoRollback.hpp" #include "rdbms/ConnFactoryFactory.hpp" @@ -59,7 +60,8 @@ OracleCatalogue::~OracleCatalogue() { //------------------------------------------------------------------------------ // deleteArchiveFile //------------------------------------------------------------------------------ -void OracleCatalogue::deleteArchiveFile(const std::string &diskInstanceName, const uint64_t archiveFileId) { +void OracleCatalogue::deleteArchiveFile(const std::string &diskInstanceName, const uint64_t archiveFileId, + log::LogContext &lc) { try { const char *selectSql = "SELECT " @@ -89,10 +91,15 @@ void OracleCatalogue::deleteArchiveFile(const std::string &diskInstanceName, con "WHERE " "ARCHIVE_FILE.ARCHIVE_FILE_ID = :ARCHIVE_FILE_ID " "FOR UPDATE"; + utils::Timer t; auto conn = m_connPool.getConn(); + const auto getConnTime = t.secs(utils::Timer::resetCounter); auto selectStmt = conn.createStmt(selectSql, rdbms::Stmt::AutocommitMode::OFF); + const auto createStmtTime = t.secs(); selectStmt->bindUint64(":ARCHIVE_FILE_ID", archiveFileId); + t.reset(); rdbms::Rset selectRset = selectStmt->executeQuery(); + const auto selectFromArchiveFileTime = t.secs(); std::unique_ptr<common::dataStructures::ArchiveFile> archiveFile; while(selectRset.next()) { if(nullptr == archiveFile.get()) { @@ -146,12 +153,14 @@ void OracleCatalogue::deleteArchiveFile(const std::string &diskInstanceName, con throw ue; } + t.reset(); { const char *const sql = "DELETE FROM TAPE_FILE WHERE ARCHIVE_FILE_ID = :ARCHIVE_FILE_ID"; auto stmt = 
conn.createStmt(sql, rdbms::Stmt::AutocommitMode::OFF); stmt->bindUint64(":ARCHIVE_FILE_ID", archiveFileId); stmt->executeNonQuery(); } + const auto deleteFromTapeFileTime = t.secs(utils::Timer::resetCounter); { const char *const sql = "DELETE FROM ARCHIVE_FILE WHERE ARCHIVE_FILE_ID = :ARCHIVE_FILE_ID"; @@ -159,23 +168,31 @@ void OracleCatalogue::deleteArchiveFile(const std::string &diskInstanceName, con stmt->bindUint64(":ARCHIVE_FILE_ID", archiveFileId); stmt->executeNonQuery(); } + const auto deleteFromArchiveFileTime = t.secs(utils::Timer::resetCounter); conn.commit(); - - std::list<cta::log::Param> params; - params.push_back(cta::log::Param("fileId", std::to_string(archiveFile->archiveFileID))); - params.push_back(cta::log::Param("diskInstance", archiveFile->diskInstance)); - params.push_back(cta::log::Param("diskFileId", archiveFile->diskFileId)); - params.push_back(cta::log::Param("diskFileInfo.path", archiveFile->diskFileInfo.path)); - params.push_back(cta::log::Param("diskFileInfo.owner", archiveFile->diskFileInfo.owner)); - params.push_back(cta::log::Param("diskFileInfo.group", archiveFile->diskFileInfo.group)); - params.push_back(cta::log::Param("diskFileInfo.recoveryBlob", archiveFile->diskFileInfo.recoveryBlob)); - params.push_back(cta::log::Param("fileSize", std::to_string(archiveFile->fileSize))); - params.push_back(cta::log::Param("checksumType", archiveFile->checksumType)); - params.push_back(cta::log::Param("checksumValue", archiveFile->checksumValue)); - params.push_back(cta::log::Param("creationTime", std::to_string(archiveFile->creationTime))); - params.push_back(cta::log::Param("reconciliationTime", std::to_string(archiveFile->reconciliationTime))); - params.push_back(cta::log::Param("storageClass", archiveFile->storageClass)); + const auto commitTime = t.secs(); + + log::ScopedParamContainer spc(lc); + spc.add("fileId", std::to_string(archiveFile->archiveFileID)) + .add("diskInstance", archiveFile->diskInstance) + .add("diskFileId", 
archiveFile->diskFileId) + .add("diskFileInfo.path", archiveFile->diskFileInfo.path) + .add("diskFileInfo.owner", archiveFile->diskFileInfo.owner) + .add("diskFileInfo.group", archiveFile->diskFileInfo.group) + .add("diskFileInfo.recoveryBlob", archiveFile->diskFileInfo.recoveryBlob) + .add("fileSize", std::to_string(archiveFile->fileSize)) + .add("checksumType", archiveFile->checksumType) + .add("checksumValue", archiveFile->checksumValue) + .add("creationTime", std::to_string(archiveFile->creationTime)) + .add("reconciliationTime", std::to_string(archiveFile->reconciliationTime)) + .add("storageClass", archiveFile->storageClass) + .add("getConnTime", getConnTime) + .add("createStmtTime", createStmtTime) + .add("selectFromArchiveFileTime", selectFromArchiveFileTime) + .add("deleteFromTapeFileTime", deleteFromTapeFileTime) + .add("deleteFromArchiveFileTime", deleteFromArchiveFileTime) + .add("commitTime", commitTime); for(auto it=archiveFile->tapeFiles.begin(); it!=archiveFile->tapeFiles.end(); it++) { std::stringstream tapeCopyLogStream; tapeCopyLogStream << "copy number: " << it->first @@ -187,9 +204,9 @@ void OracleCatalogue::deleteArchiveFile(const std::string &diskInstanceName, con << " checksumType: " << it->second.checksumType //this shouldn't be here: repeated field << " checksumValue: " << it->second.checksumValue //this shouldn't be here: repeated field << " copyNb: " << it->second.copyNb; //this shouldn't be here: repeated field - params.push_back(cta::log::Param("TAPE FILE", tapeCopyLogStream.str())); + spc.add("TAPE FILE", tapeCopyLogStream.str()); } - m_log(log::INFO, "Archive File Deleted", params); + lc.log(log::INFO, "Archive File Deleted"); } catch(exception::UserError &) { throw; } catch(exception::Exception &ex) { diff --git a/catalogue/OracleCatalogue.hpp b/catalogue/OracleCatalogue.hpp index df5a46d72848e3b4347df0822b19dd3a15f40a07..2f01fe976fa55477664b692b1e3bf6a6ab8f4ea6 100644 --- a/catalogue/OracleCatalogue.hpp +++ 
b/catalogue/OracleCatalogue.hpp @@ -77,10 +77,12 @@ public: * @param instanceName The name of the instance from where the deletion request * originated * @param archiveFileId The unique identifier of the archive file. + * @param lc The log context. * @return The metadata of the deleted archive file including the metadata of * the associated and also deleted tape copies. */ - void deleteArchiveFile(const std::string &diskInstanceName, const uint64_t archiveFileId) override; + void deleteArchiveFile(const std::string &diskInstanceName, const uint64_t archiveFileId, log::LogContext &lc) + override; /** * Returns a unique archive ID that can be used by a new archive file within diff --git a/catalogue/RdbmsCatalogue.cpp b/catalogue/RdbmsCatalogue.cpp index 6e831e2532268eca95dbef996c2bfb1a41f3d05d..eff3cceaddd2f5393e77d5caa208693974e5da90 100644 --- a/catalogue/RdbmsCatalogue.cpp +++ b/catalogue/RdbmsCatalogue.cpp @@ -25,6 +25,7 @@ #include "common/exception/UserError.hpp" #include "common/make_unique.hpp" #include "common/threading/MutexLocker.hpp" +#include "common/Timer.hpp" #include "common/utils/utils.hpp" #include "rdbms/AutoRollback.hpp" @@ -3959,10 +3960,14 @@ void RdbmsCatalogue::throwIfCommonEventDataMismatch(const common::dataStructures common::dataStructures::RetrieveFileQueueCriteria RdbmsCatalogue::prepareToRetrieveFile( const std::string &diskInstanceName, const uint64_t archiveFileId, - const common::dataStructures::UserIdentity &user) { + const common::dataStructures::UserIdentity &user, + log::LogContext &lc) { try { + cta::utils::Timer t; auto conn = m_connPool.getConn(); + const auto getConnTime = t.secs(utils::Timer::resetCounter); std::unique_ptr<common::dataStructures::ArchiveFile> archiveFile = getArchiveFile(conn, archiveFileId); + const auto getArchiveFileTime = t.secs(utils::Timer::resetCounter); if(nullptr == archiveFile.get()) { exception::Exception ex; ex.getMessage() << "Archive file with ID " << archiveFileId << " does not exist"; @@ 
-3977,8 +3982,17 @@ common::dataStructures::RetrieveFileQueueCriteria RdbmsCatalogue::prepareToRetri throw ue; } + t.reset(); const RequesterAndGroupMountPolicies mountPolicies = getMountPolicies(conn, diskInstanceName, user.name, user.group); + const auto getMountPoliciesTime = t.secs(utils::Timer::resetCounter); + + log::ScopedParamContainer spc(lc); + spc.add("getConnTime", getConnTime) + .add("getArchiveFileTime", getArchiveFileTime) + .add("getMountPoliciesTime", getMountPoliciesTime); + lc.log(log::INFO, "Catalogue::prepareToRetrieve internal timings"); + // Requester mount policies overrule requester group mount policies common::dataStructures::MountPolicy mountPolicy; if(!mountPolicies.requesterMountPolicies.empty()) { diff --git a/catalogue/RdbmsCatalogue.hpp b/catalogue/RdbmsCatalogue.hpp index 7868dbcf0ff06bffbab25f2be347bb46102e919e..16636832903212174f5059a6c005a6faccba7645 100644 --- a/catalogue/RdbmsCatalogue.hpp +++ b/catalogue/RdbmsCatalogue.hpp @@ -147,13 +147,15 @@ public: * @param user The user for whom the file is to be retrieved. This will be * used by the Catalogue to determine the mount policy to be used when * retrieving the file. + * @param lc The log context. * * @return The information required to queue the associated retrieve request(s). 
*/ common::dataStructures::RetrieveFileQueueCriteria prepareToRetrieveFile( const std::string &instanceName, const uint64_t archiveFileId, - const common::dataStructures::UserIdentity &user) override; + const common::dataStructures::UserIdentity &user, + log::LogContext &lc) override; /** * Notifies the CTA catalogue that the specified tape has been mounted in diff --git a/catalogue/SqliteCatalogue.cpp b/catalogue/SqliteCatalogue.cpp index f896350a1ffef46adca07cdf416af2707d850e0e..0d3ed2ac60a693ee6d1f560d61797825da5c79eb 100644 --- a/catalogue/SqliteCatalogue.cpp +++ b/catalogue/SqliteCatalogue.cpp @@ -23,6 +23,7 @@ #include "common/exception/UserError.hpp" #include "common/make_unique.hpp" #include "common/threading/MutexLocker.hpp" +#include "common/Timer.hpp" #include "common/utils/utils.hpp" #include "rdbms/AutoRollback.hpp" #include "rdbms/ConnFactoryFactory.hpp" @@ -54,11 +55,16 @@ SqliteCatalogue::~SqliteCatalogue() { //------------------------------------------------------------------------------ // deleteArchiveFile //------------------------------------------------------------------------------ -void SqliteCatalogue::deleteArchiveFile(const std::string &diskInstanceName, const uint64_t archiveFileId) { +void SqliteCatalogue::deleteArchiveFile(const std::string &diskInstanceName, const uint64_t archiveFileId, + log::LogContext &lc) { try { + utils::Timer t; auto conn = m_connPool.getConn(); + const auto getConnTime = t.secs(); rdbms::AutoRollback autoRollback(conn); + t.reset(); const auto archiveFile = getArchiveFile(conn, archiveFileId); + const auto getArchiveFileTime = t.secs(); if(nullptr == archiveFile.get()) { std::list<cta::log::Param> params; @@ -76,12 +82,14 @@ void SqliteCatalogue::deleteArchiveFile(const std::string &diskInstanceName, con throw ue; } + t.reset(); { const char *const sql = "DELETE FROM TAPE_FILE WHERE ARCHIVE_FILE_ID = :ARCHIVE_FILE_ID;"; auto stmt = conn.createStmt(sql, rdbms::Stmt::AutocommitMode::OFF); 
stmt->bindUint64(":ARCHIVE_FILE_ID", archiveFileId); stmt->executeNonQuery(); } + const auto deleteFromTapeFileTime = t.secs(utils::Timer::resetCounter); { const char *const sql = "DELETE FROM ARCHIVE_FILE WHERE ARCHIVE_FILE_ID = :ARCHIVE_FILE_ID;"; @@ -89,23 +97,30 @@ void SqliteCatalogue::deleteArchiveFile(const std::string &diskInstanceName, con stmt->bindUint64(":ARCHIVE_FILE_ID", archiveFileId); stmt->executeNonQuery(); } + const auto deleteFromArchiveFileTime = t.secs(utils::Timer::resetCounter); conn.commit(); - - std::list<cta::log::Param> params; - params.push_back(cta::log::Param("fileId", std::to_string(archiveFile->archiveFileID))); - params.push_back(cta::log::Param("diskInstance", archiveFile->diskInstance)); - params.push_back(cta::log::Param("diskFileId", archiveFile->diskFileId)); - params.push_back(cta::log::Param("diskFileInfo.path", archiveFile->diskFileInfo.path)); - params.push_back(cta::log::Param("diskFileInfo.owner", archiveFile->diskFileInfo.owner)); - params.push_back(cta::log::Param("diskFileInfo.group", archiveFile->diskFileInfo.group)); - params.push_back(cta::log::Param("diskFileInfo.recoveryBlob", archiveFile->diskFileInfo.recoveryBlob)); - params.push_back(cta::log::Param("fileSize", std::to_string(archiveFile->fileSize))); - params.push_back(cta::log::Param("checksumType", archiveFile->checksumType)); - params.push_back(cta::log::Param("checksumValue", archiveFile->checksumValue)); - params.push_back(cta::log::Param("creationTime", std::to_string(archiveFile->creationTime))); - params.push_back(cta::log::Param("reconciliationTime", std::to_string(archiveFile->reconciliationTime))); - params.push_back(cta::log::Param("storageClass", archiveFile->storageClass)); + const auto commitTime = t.secs(); + + log::ScopedParamContainer spc(lc); + spc.add("fileId", std::to_string(archiveFile->archiveFileID)) + .add("diskInstance", archiveFile->diskInstance) + .add("diskFileId", archiveFile->diskFileId) + .add("diskFileInfo.path", 
archiveFile->diskFileInfo.path) + .add("diskFileInfo.owner", archiveFile->diskFileInfo.owner) + .add("diskFileInfo.group", archiveFile->diskFileInfo.group) + .add("diskFileInfo.recoveryBlob", archiveFile->diskFileInfo.recoveryBlob) + .add("fileSize", std::to_string(archiveFile->fileSize)) + .add("checksumType", archiveFile->checksumType) + .add("checksumValue", archiveFile->checksumValue) + .add("creationTime", std::to_string(archiveFile->creationTime)) + .add("reconciliationTime", std::to_string(archiveFile->reconciliationTime)) + .add("storageClass", archiveFile->storageClass) + .add("getConnTime", getConnTime) + .add("getArchiveFileTime", getArchiveFileTime) + .add("deleteFromTapeFileTime", deleteFromTapeFileTime) + .add("deleteFromArchiveFileTime", deleteFromArchiveFileTime) + .add("commitTime", commitTime); for(auto it=archiveFile->tapeFiles.begin(); it!=archiveFile->tapeFiles.end(); it++) { std::stringstream tapeCopyLogStream; tapeCopyLogStream << "copy number: " << it->first @@ -117,9 +132,9 @@ void SqliteCatalogue::deleteArchiveFile(const std::string &diskInstanceName, con << " checksumType: " << it->second.checksumType //this shouldn't be here: repeated field << " checksumValue: " << it->second.checksumValue //this shouldn't be here: repeated field << " copyNb: " << it->second.copyNb; //this shouldn't be here: repeated field - params.push_back(cta::log::Param("TAPE FILE", tapeCopyLogStream.str())); + spc.add("TAPE FILE", tapeCopyLogStream.str()); } - m_log(log::INFO, "Archive File Deleted", params); + lc.log(log::INFO, "Archive File Deleted"); } catch(exception::UserError &) { throw; } catch(exception::Exception &ex) { diff --git a/catalogue/SqliteCatalogue.hpp b/catalogue/SqliteCatalogue.hpp index f54fdf0adee6fa65b083960710ab7e39335e2296..91bebd4f02340fbb467aa47573c037a3766345ca 100644 --- a/catalogue/SqliteCatalogue.hpp +++ b/catalogue/SqliteCatalogue.hpp @@ -70,10 +70,12 @@ public: * @param instanceName The name of the instance from where the deletion 
request * originated * @param archiveFileId The unique identifier of the archive file. + * @param lc The log context. * @return The metadata of the deleted archive file including the metadata of * the associated and also deleted tape copies. */ - void deleteArchiveFile(const std::string &diskInstanceName, const uint64_t archiveFileId) override; + void deleteArchiveFile(const std::string &diskInstanceName, const uint64_t archiveFileId, log::LogContext &lc) + override; protected: diff --git a/catalogue/catalogue_common_schema.sql b/catalogue/catalogue_common_schema.sql index 6d6c13e751add236335bac4cbaa4562a6b8237ed..3184c2009c24b568e924faacf3ebb98d52f6b95f 100644 --- a/catalogue/catalogue_common_schema.sql +++ b/catalogue/catalogue_common_schema.sql @@ -1,68 +1,68 @@ CREATE TABLE CTA_CATALOGUE( - SCHEMA_VERSION_MAJOR INTEGER NOT NULL, - SCHEMA_VERSION_MINOR INTEGER NOT NULL + SCHEMA_VERSION_MAJOR INTEGER CONSTRAINT CTA_CATALOGUE_SVM1_NN NOT NULL, + SCHEMA_VERSION_MINOR INTEGER CONSTRAINT CTA_CATALOGUE_SVM2_NN NOT NULL ); CREATE TABLE ADMIN_USER( - ADMIN_USER_NAME VARCHAR2(100) NOT NULL, - USER_COMMENT VARCHAR2(1000) NOT NULL, - CREATION_LOG_USER_NAME VARCHAR2(100) NOT NULL, - CREATION_LOG_HOST_NAME VARCHAR2(100) NOT NULL, - CREATION_LOG_TIME INTEGER NOT NULL, - LAST_UPDATE_USER_NAME VARCHAR2(100) NOT NULL, - LAST_UPDATE_HOST_NAME VARCHAR2(100) NOT NULL, - LAST_UPDATE_TIME INTEGER NOT NULL, + ADMIN_USER_NAME VARCHAR2(100) CONSTRAINT ADMIN_USER_AUN_NN NOT NULL, + USER_COMMENT VARCHAR2(1000) CONSTRAINT ADMIN_USER_UC_NN NOT NULL, + CREATION_LOG_USER_NAME VARCHAR2(100) CONSTRAINT ADMIN_USER_CLUN_NN NOT NULL, + CREATION_LOG_HOST_NAME VARCHAR2(100) CONSTRAINT ADMIN_USER_CLHN_NN NOT NULL, + CREATION_LOG_TIME INTEGER CONSTRAINT ADMIN_USER_CLT_NN NOT NULL, + LAST_UPDATE_USER_NAME VARCHAR2(100) CONSTRAINT ADMIN_USER_LUUN_NN NOT NULL, + LAST_UPDATE_HOST_NAME VARCHAR2(100) CONSTRAINT ADMIN_USER_LUHN_NN NOT NULL, + LAST_UPDATE_TIME INTEGER CONSTRAINT ADMIN_USER_LUT_NN NOT NULL, 
CONSTRAINT ADMIN_USER_PK PRIMARY KEY(ADMIN_USER_NAME) ); CREATE TABLE ADMIN_HOST( - ADMIN_HOST_NAME VARCHAR2(100) NOT NULL, - USER_COMMENT VARCHAR2(1000) NOT NULL, - CREATION_LOG_USER_NAME VARCHAR2(100) NOT NULL, - CREATION_LOG_HOST_NAME VARCHAR2(100) NOT NULL, - CREATION_LOG_TIME INTEGER NOT NULL, - LAST_UPDATE_USER_NAME VARCHAR2(100) NOT NULL, - LAST_UPDATE_HOST_NAME VARCHAR2(100) NOT NULL, - LAST_UPDATE_TIME INTEGER NOT NULL, + ADMIN_HOST_NAME VARCHAR2(100) CONSTRAINT ADMIN_HOST_AHN_NN NOT NULL, + USER_COMMENT VARCHAR2(1000) CONSTRAINT ADMIN_HOST_UC_NN NOT NULL, + CREATION_LOG_USER_NAME VARCHAR2(100) CONSTRAINT ADMIN_HOST_CLUN_NN NOT NULL, + CREATION_LOG_HOST_NAME VARCHAR2(100) CONSTRAINT ADMIN_HOST_CLHN_NN NOT NULL, + CREATION_LOG_TIME INTEGER CONSTRAINT ADMIN_HOST_CLT_NN NOT NULL, + LAST_UPDATE_USER_NAME VARCHAR2(100) CONSTRAINT ADMIN_HOST_LUUN_NN NOT NULL, + LAST_UPDATE_HOST_NAME VARCHAR2(100) CONSTRAINT ADMIN_HOST_LUHN_NN NOT NULL, + LAST_UPDATE_TIME INTEGER CONSTRAINT ADMIN_HOST_LUT_NN NOT NULL, CONSTRAINT ADMIN_HOST_PK PRIMARY KEY(ADMIN_HOST_NAME) ); CREATE TABLE STORAGE_CLASS( - DISK_INSTANCE_NAME VARCHAR2(100) NOT NULL, - STORAGE_CLASS_NAME VARCHAR2(100) NOT NULL, - NB_COPIES INTEGER NOT NULL, - USER_COMMENT VARCHAR2(1000) NOT NULL, - CREATION_LOG_USER_NAME VARCHAR2(100) NOT NULL, - CREATION_LOG_HOST_NAME VARCHAR2(100) NOT NULL, - CREATION_LOG_TIME INTEGER NOT NULL, - LAST_UPDATE_USER_NAME VARCHAR2(100) NOT NULL, - LAST_UPDATE_HOST_NAME VARCHAR2(100) NOT NULL, - LAST_UPDATE_TIME INTEGER NOT NULL, + DISK_INSTANCE_NAME VARCHAR2(100) CONSTRAINT STORAGE_CLASS_DIN_NN NOT NULL, + STORAGE_CLASS_NAME VARCHAR2(100) CONSTRAINT STORAGE_CLASS_SCN_NN NOT NULL, + NB_COPIES INTEGER CONSTRAINT STORAGE_CLASS_NC_NN NOT NULL, + USER_COMMENT VARCHAR2(1000) CONSTRAINT STORAGE_CLASS_UC_NN NOT NULL, + CREATION_LOG_USER_NAME VARCHAR2(100) CONSTRAINT STORAGE_CLASS_CLUN_NN NOT NULL, + CREATION_LOG_HOST_NAME VARCHAR2(100) CONSTRAINT STORAGE_CLASS_CLHN_NN NOT NULL, + 
CREATION_LOG_TIME INTEGER CONSTRAINT STORAGE_CLASS_CLT_NN NOT NULL, + LAST_UPDATE_USER_NAME VARCHAR2(100) CONSTRAINT STORAGE_CLASS_LUUN_NN NOT NULL, + LAST_UPDATE_HOST_NAME VARCHAR2(100) CONSTRAINT STORAGE_CLASS_LUHN_NN NOT NULL, + LAST_UPDATE_TIME INTEGER CONSTRAINT STORAGE_CLASS_LUT_NN NOT NULL, CONSTRAINT STORAGE_CLASS_PK PRIMARY KEY(DISK_INSTANCE_NAME, STORAGE_CLASS_NAME) ); CREATE TABLE TAPE_POOL( - TAPE_POOL_NAME VARCHAR2(100) NOT NULL, - NB_PARTIAL_TAPES INTEGER NOT NULL, - IS_ENCRYPTED INTEGER NOT NULL, - USER_COMMENT VARCHAR2(1000) NOT NULL, - CREATION_LOG_USER_NAME VARCHAR2(100) NOT NULL, - CREATION_LOG_HOST_NAME VARCHAR2(100) NOT NULL, - CREATION_LOG_TIME INTEGER NOT NULL, - LAST_UPDATE_USER_NAME VARCHAR2(100) NOT NULL, - LAST_UPDATE_HOST_NAME VARCHAR2(100) NOT NULL, - LAST_UPDATE_TIME INTEGER NOT NULL, + TAPE_POOL_NAME VARCHAR2(100) CONSTRAINT TAPE_POOL_TPN_NN NOT NULL, + NB_PARTIAL_TAPES INTEGER CONSTRAINT TAPE_POOL_NPT_NN NOT NULL, + IS_ENCRYPTED INTEGER CONSTRAINT TAPE_POOL_IE_NN NOT NULL, + USER_COMMENT VARCHAR2(1000) CONSTRAINT TAPE_POOL_UC_NN NOT NULL, + CREATION_LOG_USER_NAME VARCHAR2(100) CONSTRAINT TAPE_POOL_CLUN_NN NOT NULL, + CREATION_LOG_HOST_NAME VARCHAR2(100) CONSTRAINT TAPE_POOL_CLHN_NN NOT NULL, + CREATION_LOG_TIME INTEGER CONSTRAINT TAPE_POOL_CLT_NN NOT NULL, + LAST_UPDATE_USER_NAME VARCHAR2(100) CONSTRAINT TAPE_POOL_LUUN_NN NOT NULL, + LAST_UPDATE_HOST_NAME VARCHAR2(100) CONSTRAINT TAPE_POOL_LUHN_NN NOT NULL, + LAST_UPDATE_TIME INTEGER CONSTRAINT TAPE_POOL_LUT_NN NOT NULL, CONSTRAINT TAPE_POOL_PK PRIMARY KEY(TAPE_POOL_NAME), CONSTRAINT TAPE_POOL_IS_ENCRYPTED_BOOL_CK CHECK(IS_ENCRYPTED IN (0, 1)) ); CREATE TABLE ARCHIVE_ROUTE( - DISK_INSTANCE_NAME VARCHAR2(100) NOT NULL, - STORAGE_CLASS_NAME VARCHAR2(100) NOT NULL, - COPY_NB INTEGER NOT NULL, - TAPE_POOL_NAME VARCHAR2(100) NOT NULL, - USER_COMMENT VARCHAR2(1000) NOT NULL, - CREATION_LOG_USER_NAME VARCHAR2(100) NOT NULL, - CREATION_LOG_HOST_NAME VARCHAR2(100) NOT NULL, - 
CREATION_LOG_TIME INTEGER NOT NULL, - LAST_UPDATE_USER_NAME VARCHAR2(100) NOT NULL, - LAST_UPDATE_HOST_NAME VARCHAR2(100) NOT NULL, - LAST_UPDATE_TIME INTEGER NOT NULL, + DISK_INSTANCE_NAME VARCHAR2(100) CONSTRAINT ARCHIVE_ROUTE_DIN_NN NOT NULL, + STORAGE_CLASS_NAME VARCHAR2(100) CONSTRAINT ARCHIVE_ROUTE_SCN_NN NOT NULL, + COPY_NB INTEGER CONSTRAINT ARCHIVE_ROUTE_CN_NN NOT NULL, + TAPE_POOL_NAME VARCHAR2(100) CONSTRAINT ARCHIVE_ROUTE_TPN_NN NOT NULL, + USER_COMMENT VARCHAR2(1000) CONSTRAINT ARCHIVE_ROUTE_UC_NN NOT NULL, + CREATION_LOG_USER_NAME VARCHAR2(100) CONSTRAINT ARCHIVE_ROUTE_CLUN_NN NOT NULL, + CREATION_LOG_HOST_NAME VARCHAR2(100) CONSTRAINT ARCHIVE_ROUTE_CLHN_NN NOT NULL, + CREATION_LOG_TIME INTEGER CONSTRAINT ARCHIVE_ROUTE_CLT_NN NOT NULL, + LAST_UPDATE_USER_NAME VARCHAR2(100) CONSTRAINT ARCHIVE_ROUTE_LUUN_NN NOT NULL, + LAST_UPDATE_HOST_NAME VARCHAR2(100) CONSTRAINT ARCHIVE_ROUTE_LUHN_NN NOT NULL, + LAST_UPDATE_TIME INTEGER CONSTRAINT ARCHIVE_ROUTE_LUT_NN NOT NULL, CONSTRAINT ARCHIVE_ROUTE_PK PRIMARY KEY(DISK_INSTANCE_NAME, STORAGE_CLASS_NAME, COPY_NB), CONSTRAINT ARCHIVE_ROUTE_STORAGE_CLASS_FK FOREIGN KEY(DISK_INSTANCE_NAME, STORAGE_CLASS_NAME) REFERENCES STORAGE_CLASS(DISK_INSTANCE_NAME, STORAGE_CLASS_NAME), @@ -70,26 +70,26 @@ CREATE TABLE ARCHIVE_ROUTE( REFERENCES TAPE_POOL(TAPE_POOL_NAME) ); CREATE TABLE LOGICAL_LIBRARY( - LOGICAL_LIBRARY_NAME VARCHAR2(100) NOT NULL, - USER_COMMENT VARCHAR2(1000) NOT NULL, - CREATION_LOG_USER_NAME VARCHAR2(100) NOT NULL, - CREATION_LOG_HOST_NAME VARCHAR2(100) NOT NULL, - CREATION_LOG_TIME INTEGER NOT NULL, - LAST_UPDATE_USER_NAME VARCHAR2(100) NOT NULL, - LAST_UPDATE_HOST_NAME VARCHAR2(100) NOT NULL, - LAST_UPDATE_TIME INTEGER NOT NULL, + LOGICAL_LIBRARY_NAME VARCHAR2(100) CONSTRAINT LOGICAL_LIBRARY_LLL_NN NOT NULL, + USER_COMMENT VARCHAR2(1000) CONSTRAINT LOGICAL_LIBRARY_UC_NN NOT NULL, + CREATION_LOG_USER_NAME VARCHAR2(100) CONSTRAINT LOGICAL_LIBRARY_CLUN_NN NOT NULL, + CREATION_LOG_HOST_NAME VARCHAR2(100) 
CONSTRAINT LOGICAL_LIBRARY_CLHN_NN NOT NULL, + CREATION_LOG_TIME INTEGER CONSTRAINT LOGICAL_LIBRARY_CLT_NN NOT NULL, + LAST_UPDATE_USER_NAME VARCHAR2(100) CONSTRAINT LOGICAL_LIBRARY_LUUN_NN NOT NULL, + LAST_UPDATE_HOST_NAME VARCHAR2(100) CONSTRAINT LOGICAL_LIBRARY_LUHN_NN NOT NULL, + LAST_UPDATE_TIME INTEGER CONSTRAINT LOGICAL_LIBRARY_LUT_NN NOT NULL, CONSTRAINT LOGICAL_LIBRARY_PK PRIMARY KEY(LOGICAL_LIBRARY_NAME) ); CREATE TABLE TAPE( - VID VARCHAR2(100) NOT NULL, - LOGICAL_LIBRARY_NAME VARCHAR2(100) NOT NULL, - TAPE_POOL_NAME VARCHAR2(100) NOT NULL, + VID VARCHAR2(100) CONSTRAINT TAPE_V_UN NOT NULL, + LOGICAL_LIBRARY_NAME VARCHAR2(100) CONSTRAINT TAPE_LLN_UN NOT NULL, + TAPE_POOL_NAME VARCHAR2(100) CONSTRAINT TAPE_TPN_UN NOT NULL, ENCRYPTION_KEY VARCHAR2(100), - CAPACITY_IN_BYTES INTEGER NOT NULL, - DATA_IN_BYTES INTEGER NOT NULL, - LAST_FSEQ INTEGER NOT NULL, - IS_DISABLED INTEGER NOT NULL, - IS_FULL INTEGER NOT NULL, + CAPACITY_IN_BYTES INTEGER CONSTRAINT TAPE_CIB_UN NOT NULL, + DATA_IN_BYTES INTEGER CONSTRAINT TAPE_DIB_UN NOT NULL, + LAST_FSEQ INTEGER CONSTRAINT TAPE_LF_UN NOT NULL, + IS_DISABLED INTEGER CONSTRAINT TAPE_ID_UN NOT NULL, + IS_FULL INTEGER CONSTRAINT TAPE_IF_UN NOT NULL, LBP_IS_ON INTEGER, LABEL_DRIVE VARCHAR2(100), LABEL_TIME INTEGER, @@ -97,13 +97,13 @@ CREATE TABLE TAPE( LAST_READ_TIME INTEGER, LAST_WRITE_DRIVE VARCHAR2(100), LAST_WRITE_TIME INTEGER, - USER_COMMENT VARCHAR2(1000) NOT NULL, - CREATION_LOG_USER_NAME VARCHAR2(100) NOT NULL, - CREATION_LOG_HOST_NAME VARCHAR2(100) NOT NULL, - CREATION_LOG_TIME INTEGER NOT NULL, - LAST_UPDATE_USER_NAME VARCHAR2(100) NOT NULL, - LAST_UPDATE_HOST_NAME VARCHAR2(100) NOT NULL, - LAST_UPDATE_TIME INTEGER NOT NULL, + USER_COMMENT VARCHAR2(1000) CONSTRAINT TAPE_UC_UN NOT NULL, + CREATION_LOG_USER_NAME VARCHAR2(100) CONSTRAINT TAPE_CLUN_UN NOT NULL, + CREATION_LOG_HOST_NAME VARCHAR2(100) CONSTRAINT TAPE_CLHN_UN NOT NULL, + CREATION_LOG_TIME INTEGER CONSTRAINT TAPE_CLT_NN NOT NULL, + LAST_UPDATE_USER_NAME 
VARCHAR2(100) CONSTRAINT TAPE_LUUN_UN NOT NULL, + LAST_UPDATE_HOST_NAME VARCHAR2(100) CONSTRAINT TAPE_LUHN_UN NOT NULL, + LAST_UPDATE_TIME INTEGER CONSTRAINT TAPE_LUT_UN NOT NULL, CONSTRAINT TAPE_PK PRIMARY KEY(VID), CONSTRAINT TAPE_LOGICAL_LIBRARY_FK FOREIGN KEY(LOGICAL_LIBRARY_NAME) REFERENCES LOGICAL_LIBRARY(LOGICAL_LIBRARY_NAME), @@ -114,77 +114,77 @@ CREATE TABLE TAPE( CONSTRAINT TAPE_LBP_IS_ON_BOOL_CK CHECK(LBP_IS_ON = 0 OR LBP_IS_ON = 1) ); CREATE TABLE MOUNT_POLICY( - MOUNT_POLICY_NAME VARCHAR2(100) NOT NULL, - ARCHIVE_PRIORITY INTEGER NOT NULL, - ARCHIVE_MIN_REQUEST_AGE INTEGER NOT NULL, - RETRIEVE_PRIORITY INTEGER NOT NULL, - RETRIEVE_MIN_REQUEST_AGE INTEGER NOT NULL, - MAX_DRIVES_ALLOWED INTEGER NOT NULL, - USER_COMMENT VARCHAR2(1000) NOT NULL, - CREATION_LOG_USER_NAME VARCHAR2(100) NOT NULL, - CREATION_LOG_HOST_NAME VARCHAR2(100) NOT NULL, - CREATION_LOG_TIME INTEGER NOT NULL, - LAST_UPDATE_USER_NAME VARCHAR2(100) NOT NULL, - LAST_UPDATE_HOST_NAME VARCHAR2(100) NOT NULL, - LAST_UPDATE_TIME INTEGER NOT NULL, + MOUNT_POLICY_NAME VARCHAR2(100) CONSTRAINT MOUNT_POLICY_MPN_NN NOT NULL, + ARCHIVE_PRIORITY INTEGER CONSTRAINT MOUNT_POLICY_AP_NN NOT NULL, + ARCHIVE_MIN_REQUEST_AGE INTEGER CONSTRAINT MOUNT_POLICY_AMRA_NN NOT NULL, + RETRIEVE_PRIORITY INTEGER CONSTRAINT MOUNT_POLICY_RP_NN NOT NULL, + RETRIEVE_MIN_REQUEST_AGE INTEGER CONSTRAINT MOUNT_POLICY_RMRA_NN NOT NULL, + MAX_DRIVES_ALLOWED INTEGER CONSTRAINT MOUNT_POLICY_MDA_NN NOT NULL, + USER_COMMENT VARCHAR2(1000) CONSTRAINT MOUNT_POLICY_UC_NN NOT NULL, + CREATION_LOG_USER_NAME VARCHAR2(100) CONSTRAINT MOUNT_POLICY_CLUN_NN NOT NULL, + CREATION_LOG_HOST_NAME VARCHAR2(100) CONSTRAINT MOUNT_POLICY_CLHN_NN NOT NULL, + CREATION_LOG_TIME INTEGER CONSTRAINT MOUNT_POLICY_CLT_NN NOT NULL, + LAST_UPDATE_USER_NAME VARCHAR2(100) CONSTRAINT MOUNT_POLICY_LUUN_NN NOT NULL, + LAST_UPDATE_HOST_NAME VARCHAR2(100) CONSTRAINT MOUNT_POLICY_LUHN_NN NOT NULL, + LAST_UPDATE_TIME INTEGER CONSTRAINT MOUNT_POLICY_LUT_NN NOT NULL, 
CONSTRAINT MOUNT_POLICY_PK PRIMARY KEY(MOUNT_POLICY_NAME) ); CREATE TABLE REQUESTER_MOUNT_RULE( - DISK_INSTANCE_NAME VARCHAR2(100) NOT NULL, - REQUESTER_NAME VARCHAR2(100) NOT NULL, - MOUNT_POLICY_NAME VARCHAR2(100) NOT NULL, - USER_COMMENT VARCHAR2(1000) NOT NULL, - CREATION_LOG_USER_NAME VARCHAR2(100) NOT NULL, - CREATION_LOG_HOST_NAME VARCHAR2(100) NOT NULL, - CREATION_LOG_TIME INTEGER NOT NULL, - LAST_UPDATE_USER_NAME VARCHAR2(100) NOT NULL, - LAST_UPDATE_HOST_NAME VARCHAR2(100) NOT NULL, - LAST_UPDATE_TIME INTEGER NOT NULL, + DISK_INSTANCE_NAME VARCHAR2(100) CONSTRAINT RQSTER_RULE_DIN_NN NOT NULL, + REQUESTER_NAME VARCHAR2(100) CONSTRAINT RQSTER_RULE_RN_NN NOT NULL, + MOUNT_POLICY_NAME VARCHAR2(100) CONSTRAINT RQSTER_RULE_MPN_NN NOT NULL, + USER_COMMENT VARCHAR2(1000) CONSTRAINT RQSTER_RULE_UC_NN NOT NULL, + CREATION_LOG_USER_NAME VARCHAR2(100) CONSTRAINT RQSTER_RULE_CLUN_NN NOT NULL, + CREATION_LOG_HOST_NAME VARCHAR2(100) CONSTRAINT RQSTER_RULE_CLHN_NN NOT NULL, + CREATION_LOG_TIME INTEGER CONSTRAINT RQSTER_RULE_CLT_NN NOT NULL, + LAST_UPDATE_USER_NAME VARCHAR2(100) CONSTRAINT RQSTER_RULE_LUUN_NN NOT NULL, + LAST_UPDATE_HOST_NAME VARCHAR2(100) CONSTRAINT RQSTER_RULE_LUHN_NN NOT NULL, + LAST_UPDATE_TIME INTEGER CONSTRAINT RQSTER_RULE_LUT_NN NOT NULL, CONSTRAINT RQSTER_RULE_PK PRIMARY KEY(DISK_INSTANCE_NAME, REQUESTER_NAME), CONSTRAINT RQSTER_RULE_MNT_PLC_FK FOREIGN KEY(MOUNT_POLICY_NAME) REFERENCES MOUNT_POLICY(MOUNT_POLICY_NAME) ); CREATE TABLE REQUESTER_GROUP_MOUNT_RULE( - DISK_INSTANCE_NAME VARCHAR2(100) NOT NULL, - REQUESTER_GROUP_NAME VARCHAR2(100) NOT NULL, - MOUNT_POLICY_NAME VARCHAR2(100) NOT NULL, - USER_COMMENT VARCHAR2(1000) NOT NULL, - CREATION_LOG_USER_NAME VARCHAR2(100) NOT NULL, - CREATION_LOG_HOST_NAME VARCHAR2(100) NOT NULL, - CREATION_LOG_TIME INTEGER NOT NULL, - LAST_UPDATE_USER_NAME VARCHAR2(100) NOT NULL, - LAST_UPDATE_HOST_NAME VARCHAR2(100) NOT NULL, - LAST_UPDATE_TIME INTEGER NOT NULL, + DISK_INSTANCE_NAME VARCHAR2(100) CONSTRAINT 
RQSTER_GRP_RULE_DIN_NN NOT NULL, + REQUESTER_GROUP_NAME VARCHAR2(100) CONSTRAINT RQSTER_GRP_RULE_RGN_NN NOT NULL, + MOUNT_POLICY_NAME VARCHAR2(100) CONSTRAINT RQSTER_GRP_RULE_MPN_NN NOT NULL, + USER_COMMENT VARCHAR2(1000) CONSTRAINT RQSTER_GRP_RULE_UC_NN NOT NULL, + CREATION_LOG_USER_NAME VARCHAR2(100) CONSTRAINT RQSTER_GRP_RULE_CLUN_NN NOT NULL, + CREATION_LOG_HOST_NAME VARCHAR2(100) CONSTRAINT RQSTER_GRP_RULE_CLHN_NN NOT NULL, + CREATION_LOG_TIME INTEGER CONSTRAINT RQSTER_GRP_RULE_CLT_NN NOT NULL, + LAST_UPDATE_USER_NAME VARCHAR2(100) CONSTRAINT RQSTER_GRP_RULE_LUUN_NN NOT NULL, + LAST_UPDATE_HOST_NAME VARCHAR2(100) CONSTRAINT RQSTER_GRP_RULE_LUHN_NN NOT NULL, + LAST_UPDATE_TIME INTEGER CONSTRAINT RQSTER_GRP_RULE_LUT_NN NOT NULL, CONSTRAINT RQSTER_GRP_RULE_PK PRIMARY KEY(DISK_INSTANCE_NAME, REQUESTER_GROUP_NAME), CONSTRAINT RQSTER_GRP_RULE_MNT_PLC_FK FOREIGN KEY(MOUNT_POLICY_NAME) REFERENCES MOUNT_POLICY(MOUNT_POLICY_NAME) ); CREATE TABLE ARCHIVE_FILE( - ARCHIVE_FILE_ID INTEGER NOT NULL, - DISK_INSTANCE_NAME VARCHAR2(100) NOT NULL, - DISK_FILE_ID VARCHAR2(100) NOT NULL, - DISK_FILE_PATH VARCHAR2(2000) NOT NULL, - DISK_FILE_USER VARCHAR2(100) NOT NULL, - DISK_FILE_GROUP VARCHAR2(100) NOT NULL, - DISK_FILE_RECOVERY_BLOB VARCHAR2(4000) NOT NULL, - SIZE_IN_BYTES INTEGER NOT NULL, - CHECKSUM_TYPE VARCHAR2(100) NOT NULL, - CHECKSUM_VALUE VARCHAR2(100) NOT NULL, - STORAGE_CLASS_NAME VARCHAR2(100) NOT NULL, - CREATION_TIME INTEGER NOT NULL, - RECONCILIATION_TIME INTEGER NOT NULL, + ARCHIVE_FILE_ID INTEGER CONSTRAINT ARCHIVE_FILE_AFI_NN NOT NULL, + DISK_INSTANCE_NAME VARCHAR2(100) CONSTRAINT ARCHIVE_FILE_DIN_NN NOT NULL, + DISK_FILE_ID VARCHAR2(100) CONSTRAINT ARCHIVE_FILE_DFI_NN NOT NULL, + DISK_FILE_PATH VARCHAR2(2000) CONSTRAINT ARCHIVE_FILE_DFP_NN NOT NULL, + DISK_FILE_USER VARCHAR2(100) CONSTRAINT ARCHIVE_FILE_DFU_NN NOT NULL, + DISK_FILE_GROUP VARCHAR2(100) CONSTRAINT ARCHIVE_FILE_DFG_NN NOT NULL, + DISK_FILE_RECOVERY_BLOB VARCHAR2(4000) CONSTRAINT 
ARCHIVE_FILE_DFRB_NN NOT NULL, + SIZE_IN_BYTES INTEGER CONSTRAINT ARCHIVE_FILE_SIB_NN NOT NULL, + CHECKSUM_TYPE VARCHAR2(100) CONSTRAINT ARCHIVE_FILE_CT1_NN NOT NULL, + CHECKSUM_VALUE VARCHAR2(100) CONSTRAINT ARCHIVE_FILE_CV_NN NOT NULL, + STORAGE_CLASS_NAME VARCHAR2(100) CONSTRAINT ARCHIVE_FILE_SCN_NN NOT NULL, + CREATION_TIME INTEGER CONSTRAINT ARCHIVE_FILE_CT2_NN NOT NULL, + RECONCILIATION_TIME INTEGER CONSTRAINT ARCHIVE_FILE_RT_NN NOT NULL, CONSTRAINT ARCHIVE_FILE_PK PRIMARY KEY(ARCHIVE_FILE_ID), CONSTRAINT ARCHIVE_FILE_STORAGE_CLASS_FK FOREIGN KEY(DISK_INSTANCE_NAME, STORAGE_CLASS_NAME) REFERENCES STORAGE_CLASS(DISK_INSTANCE_NAME, STORAGE_CLASS_NAME) ); CREATE TABLE TAPE_FILE( - VID VARCHAR2(100) NOT NULL, - FSEQ INTEGER NOT NULL, - BLOCK_ID INTEGER NOT NULL, - COMPRESSED_SIZE_IN_BYTES INTEGER NOT NULL, - COPY_NB INTEGER NOT NULL, - CREATION_TIME INTEGER NOT NULL, - ARCHIVE_FILE_ID INTEGER NOT NULL, + VID VARCHAR2(100) CONSTRAINT TAPE_FILE_V_NN NOT NULL, + FSEQ INTEGER CONSTRAINT TAPE_FILE_F_NN NOT NULL, + BLOCK_ID INTEGER CONSTRAINT TAPE_FILE_BI_NN NOT NULL, + COMPRESSED_SIZE_IN_BYTES INTEGER CONSTRAINT TAPE_FILE_CSIB_NN NOT NULL, + COPY_NB INTEGER CONSTRAINT TAPE_FILE_CN_NN NOT NULL, + CREATION_TIME INTEGER CONSTRAINT TAPE_FILE_CT_NN NOT NULL, + ARCHIVE_FILE_ID INTEGER CONSTRAINT TAPE_FILE_AFI_NN NOT NULL, CONSTRAINT TAPE_FILE_PK PRIMARY KEY(VID, FSEQ), CONSTRAINT TAPE_FILE_TAPE_FK FOREIGN KEY(VID) REFERENCES TAPE(VID), diff --git a/common/dataStructures/DriveStatus.cpp b/common/dataStructures/DriveStatus.cpp index 4070a08d549bd4e71f1128e3c0a775989b4f6487..c28f5465b9337cc85711e2500210330c88ee538c 100644 --- a/common/dataStructures/DriveStatus.cpp +++ b/common/dataStructures/DriveStatus.cpp @@ -25,7 +25,7 @@ std::string cta::common::dataStructures::toString(cta::common::dataStructures::D case cta::common::dataStructures::DriveStatus::Down: return "Down"; case cta::common::dataStructures::DriveStatus::Up: - return "Up"; + return "Free"; case 
cta::common::dataStructures::DriveStatus::Starting: return "Start"; case cta::common::dataStructures::DriveStatus::Mounting: diff --git a/continuousintegration/buildtree_runner/recreate_buildtree_running_environment.sh b/continuousintegration/buildtree_runner/recreate_buildtree_running_environment.sh index 5a5378338fc5c487661f8dbd28bb2b87dd6af4c2..ed722256e5a5305d473d6b0c5b2ceafffb9122e5 100755 --- a/continuousintegration/buildtree_runner/recreate_buildtree_running_environment.sh +++ b/continuousintegration/buildtree_runner/recreate_buildtree_running_environment.sh @@ -8,9 +8,11 @@ echo Wiping mhvtl and kubernetes library info... systemctl stop mhvtl || true sleep 2 rm -rf /etc/mhvtl +umount /opt/mhvtl || true rm -rf /opt/mhvtl rm -rf /opt/kubernetes/CTA/library mkdir -p /opt/mhvtl +mount -t tmpfs -o size=512m tmpfs /opt/mhvtl ################################################################################ ### puppet:///modules/hg_cta/00-cta-tape.rules diff --git a/continuousintegration/orchestration/tests/client_ar.sh b/continuousintegration/orchestration/tests/client_ar.sh index 0640daa3b8a22d939fbd57f81329befcdf2a58e8..7d135d2387a13411499bc146af8f9dd6ac58d03d 100644 --- a/continuousintegration/orchestration/tests/client_ar.sh +++ b/continuousintegration/orchestration/tests/client_ar.sh @@ -78,7 +78,7 @@ done | xargs --max-procs=${NB_PROCS} -iTEST_FILE_NAME xrdcp --silent /tmp/testfi echo Done. 
-eos root://${EOSINSTANCE} ls ${EOS_DIR} | egrep "${TEST_FILE_NAME_BASE}[0-9]+" | sed -e 's/$/ copied/' > ${STATUS_FILE=} +eos root://${EOSINSTANCE} ls ${EOS_DIR} | egrep "${TEST_FILE_NAME_BASE}[0-9]+" | sed -e 's/$/ copied/' > ${STATUS_FILE} echo "Waiting for files to be on tape:" SECONDS_PASSED=0 @@ -176,7 +176,7 @@ test -z $TAILPID || kill ${TAILPID} test ${RETRIEVED} = ${NB_FILES} && exit 0 -echo "ERROR there were some lost files during the archive/retrieve test with ${NB_FILES} files:" -grep -v retrieved ${STATUS_FILE} | sed -e "s;^;${EOS_DIR}/;" +echo "ERROR there were some lost files during the archive/retrieve test with ${NB_FILES} files (first 10):" +grep -v retrieved ${STATUS_FILE} | sed -e "s;^;${EOS_DIR}/;" | head -10 exit 1 diff --git a/eos/DiskReporter.hpp b/eos/DiskReporter.hpp index affaf001de8d8cf8f6cc57084ea6950e6d2c3378..72441f2232a71e804094895f99a172ce9fe538f3 100644 --- a/eos/DiskReporter.hpp +++ b/eos/DiskReporter.hpp @@ -23,6 +23,7 @@ namespace cta { namespace eos { class DiskReporter { public: virtual void reportArchiveFullyComplete() = 0; + virtual void asyncReportArchiveFullyComplete() = 0; virtual ~DiskReporter() {}; }; diff --git a/eos/DiskReporterFactory.cpp b/eos/DiskReporterFactory.cpp index a49735eb03c1e171225c726c0695e705ab4d9cfc..4be5df6936e94e57a2cfcf6f844a9472741bdb14 100644 --- a/eos/DiskReporterFactory.cpp +++ b/eos/DiskReporterFactory.cpp @@ -24,14 +24,15 @@ namespace cta { namespace eos { -DiskReporter* DiskReporterFactory::createDiskReporter(const std::string URL) { +DiskReporter* DiskReporterFactory::createDiskReporter(const std::string URL, std::promise<void> &reporterState) { threading::MutexLocker ml(m_mutex); auto regexResult = m_EosUrlRegex.exec(URL); if (regexResult.size()) { - return new EOSReporter(regexResult[1], regexResult[2]); + return new EOSReporter(regexResult[1], regexResult[2], reporterState); } regexResult = m_NullRegex.exec(URL); if (regexResult.size()) { + reporterState.set_value(); return new 
NullReporter(); } throw cta::exception::Exception( diff --git a/eos/DiskReporterFactory.hpp b/eos/DiskReporterFactory.hpp index 3c52eb459675a3c0838bc692c6b3a9dbb51a9d25..be3ffac3c78acfc1279e8f697da71ad95861250b 100644 --- a/eos/DiskReporterFactory.hpp +++ b/eos/DiskReporterFactory.hpp @@ -23,12 +23,13 @@ #include "common/threading/Mutex.hpp" #include <string> +#include <future> namespace cta { namespace eos { class DiskReporterFactory { public: - DiskReporter * createDiskReporter(const std::string URL); + DiskReporter * createDiskReporter(const std::string URL, std::promise<void> &sreporterState); private: // The typical call to give report to EOS will be: // xrdfs localhost query opaquefile "/eos/wfe/passwd?mgm.pcmd=event&mgm.fid=112&mgm.logid=cta&mgm.event=migrated&mgm.workflow=default&mgm.path=/eos/wfe/passwd&mgm.ruid=0&mgm.rgid=0" diff --git a/eos/EOSReporter.cpp b/eos/EOSReporter.cpp index d48967dbf9b3c651c132ba356dd7f6f901ce3571..4905da0f3933d37c979c580591bad15ed263c80e 100644 --- a/eos/EOSReporter.cpp +++ b/eos/EOSReporter.cpp @@ -16,13 +16,15 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ +#include <future> + #include "EOSReporter.hpp" #include "common/exception/XrootCl.hpp" namespace cta { namespace eos { -EOSReporter::EOSReporter(const std::string& hostURL, const std::string& queryValue): - m_fs(hostURL), m_query(queryValue) {} +EOSReporter::EOSReporter(const std::string& hostURL, const std::string& queryValue, std::promise<void>& reporterState): + m_fs(hostURL), m_query(queryValue), m_reporterState(reporterState) {} void EOSReporter::reportArchiveFullyComplete() { @@ -30,11 +32,48 @@ void EOSReporter::reportArchiveFullyComplete() { XrdCl::Buffer arg (m_query.size()); arg.FromString(m_query); XrdCl::Buffer * resp = nullptr; - const uint16_t queryTimeout = 15; // Timeout in seconds that is rounded up to the nearest 15 seconds - XrdCl::XRootDStatus status=m_fs.Query(qcOpaque, arg, resp, queryTimeout); + XrdCl::XRootDStatus status=m_fs.Query(qcOpaque, arg, resp, CTA_EOS_QUERY_TIMEOUT); delete (resp); cta::exception::XrootCl::throwOnError(status, "In EOSReporter::reportArchiveFullyComplete(): failed to XrdCl::FileSystem::Query()"); } +void EOSReporter::asyncReportArchiveFullyComplete() { + auto qcOpaque = XrdCl::QueryCode::OpaqueFile; + XrdCl::Buffer arg (m_query.size()); + arg.FromString(m_query); + AsyncQueryHandler *handler = new AsyncQueryHandler(m_reporterState); + XrdCl::XRootDStatus status=m_fs.Query( qcOpaque, arg, handler, CTA_EOS_QUERY_TIMEOUT); + cta::exception::XrootCl::throwOnError(status, + "In EOSReporter::asyncReportArchiveFullyComplete(): failed to XrdCl::FileSystem::Query()"); +} + +//------------------------------------------------------------------------------ +//EOSReporter::AsyncQueryHandler::AsyncQueryHandler +//------------------------------------------------------------------------------ +EOSReporter::AsyncQueryHandler::AsyncQueryHandler(std::promise<void> &handlerPromise): + m_handlerPromise(handlerPromise) {} + +//------------------------------------------------------------------------------ 
+//EOSReporter::AsyncQueryHandler::HandleResponse +//------------------------------------------------------------------------------ +void EOSReporter::AsyncQueryHandler::HandleResponse(XrdCl::XRootDStatus *status, + XrdCl::AnyObject *response) { + try { + cta::exception::XrootCl::throwOnError(*status, + "In EOSReporter::AsyncQueryHandler::HandleResponse(): failed to XrdCl::FileSystem::Query()"); + } catch (...) { + try { + // store anything thrown in the promise + m_handlerPromise.set_exception(std::current_exception()); + } catch(...) { + // set_exception() may throw too + } + } + + m_handlerPromise.set_value(); + delete response; + delete status; + delete this; + } }} // namespace cta::disk diff --git a/eos/EOSReporter.hpp b/eos/EOSReporter.hpp index c3c78de642ea5099dcf31080199e9668f13cc587..c35fe5490910891653a18aa2145809bed9ef0ed2 100644 --- a/eos/EOSReporter.hpp +++ b/eos/EOSReporter.hpp @@ -21,15 +21,28 @@ #include "DiskReporter.hpp" #include <XrdCl/XrdClFileSystem.hh> -namespace cta { namespace eos { +#include <future> +namespace cta { namespace eos { +const uint16_t CTA_EOS_QUERY_TIMEOUT = 15; // Timeout in seconds that is rounded up to the nearest 15 seconds + class EOSReporter: public DiskReporter { public: - EOSReporter(const std::string & hostURL, const std::string & queryValue); + EOSReporter(const std::string & hostURL, const std::string & queryValue, std::promise<void> &reporterState); void reportArchiveFullyComplete() override; + void asyncReportArchiveFullyComplete() override; private: XrdCl::FileSystem m_fs; std::string m_query; + std::promise<void> &m_reporterState; + class AsyncQueryHandler: public XrdCl::ResponseHandler { + public: + AsyncQueryHandler(std::promise<void> &handlerPromise); + virtual void HandleResponse(XrdCl::XRootDStatus *status, + XrdCl::AnyObject *response); + private: + std::promise<void> &m_handlerPromise; + }; }; -}} // namespace cta::disk \ No newline at end of file +}} // namespace cta::disk diff --git 
a/eos/NullReporter.hpp b/eos/NullReporter.hpp index e4d66ebc2239ef2ffcec2e7e24b039e2f7356ce4..86d1786a5c7c923275046972f13397ec4befbad9 100644 --- a/eos/NullReporter.hpp +++ b/eos/NullReporter.hpp @@ -26,6 +26,7 @@ class NullReporter: public DiskReporter { public: NullReporter() {}; void reportArchiveFullyComplete() override {}; + void asyncReportArchiveFullyComplete() override {}; }; }} // namespace cta::disk \ No newline at end of file diff --git a/eos_wfe_scripts/create_tape_drop_disk_replicas b/eos_wfe_scripts/create_tape_drop_disk_replicas index 0876f258bd10ab7ac5a109d558d1b355ec33c70b..12acea3666a2c401387a444282d2171d63399038 100755 --- a/eos_wfe_scripts/create_tape_drop_disk_replicas +++ b/eos_wfe_scripts/create_tape_drop_disk_replicas @@ -16,16 +16,37 @@ LOG_DATE=`/usr/bin/date +%s | /usr/bin/tr '\n' ' ' ; /usr/bin/date` LOG_SCRIPT_NAME=`/usr/bin/basename $0` LOG_FILE="/var/log/eos/wfe/${WF_TAG}.log" -echo "$LOG_DATE $LOG_SCRIPT_NAME creating tape replica with fsid ${TAPE_FS_ID} for ${FILE_PATH}" >> ${LOG_FILE} -if ! eos -r 0 0 file tag "${FILE_PATH}" +${TAPE_FS_ID}; then - echo "$LOG_DATE $LOG_SCRIPT_NAME failed to create tape replica with fsid ${TAPE_FS_ID} for ${FILE_PATH}" >> ${LOG_FILE} +# Creating tape replica +echo "${LOG_DATE} ${LOG_SCRIPT_NAME} creating tape replica with fsid ${TAPE_FS_ID} for ${FILE_PATH}" >> ${LOG_FILE} +OUTPUT=`/usr/bin/eos -r 0 0 file tag "${FILE_PATH}" +${TAPE_FS_ID} 2>&1` +RESULT=$? +if [ 0 -ne ${RESULT} ]; then + echo "${LOG_DATE} ${LOG_SCRIPT_NAME} failed to create tape replica with fsid ${TAPE_FS_ID} for ${FILE_PATH}: ${RESULT} ${OUTPUT}" >> ${LOG_FILE} exit 1 fi -for DISK_FSID in `eos file info "${FILE_PATH}" -m | sed s/\ /'\n'/g | grep fsid | sed s/fsid=// | grep -v ${TAPE_FS_ID}`; do - echo "$LOG_DATE $LOG_SCRIPT_NAME deleting disk replica with fsid ${DISK_FSID} for ${FILE_PATH}" >> ${LOG_FILE} - if ! 
eos -r 0 0 file drop "${FILE_PATH}" ${DISK_FSID}; then - echo "$LOG_DATE $LOG_SCRIPT_NAME failed to delete disk replica with fsid ${DISK_FSID} for ${FILE_PATH}" >> ${LOG_FILE} +# Checking tape replica +#echo "${LOG_DATE} ${LOG_SCRIPT_NAME} checking tape replica for ${FILE_PATH}" >> ${LOG_FILE} +OUTPUT=`/usr/bin/eos -r 0 0 attr ls "${FILE_PATH}" 2>&1 | /usr/bin/grep -c '^sys.archiveFileId=\"'` +RESULT=$? +if [ 1 -gt ${OUTPUT} ]; then + echo "${LOG_DATE} ${LOG_SCRIPT_NAME} missing tape replica sys.archiveFileId for ${FILE_PATH}: ${RESULT} ${OUTPUT}" >> ${LOG_FILE} + exit 1 +fi +OUTPUT=`/usr/bin/eos -r 0 0 ls -y "${FILE_PATH}" 2>&1 | /usr/bin/grep -c '^d.::t[123456789]'` +RESULT=$? +if [ 1 -gt ${OUTPUT} ]; then + echo "${LOG_DATE} ${LOG_SCRIPT_NAME} tape replica (expecting at least d?::t1) missing in EOS for ${FILE_PATH}: ${RESULT} ${OUTPUT}" >> ${LOG_FILE} + exit 1 +fi + +# Deleting disk replica +for DISK_FSID in `/usr/bin/eos file info "${FILE_PATH}" -m | /usr/bin/sed s/\ /'\n'/g | /usr/bin/grep fsid | /usr/bin/sed s/fsid=// | /usr/bin/grep -v ${TAPE_FS_ID}`; do + echo "${LOG_DATE} ${LOG_SCRIPT_NAME} deleting disk replica with fsid ${DISK_FSID} for ${FILE_PATH}" >> ${LOG_FILE} + OUTPUT=`/usr/bin/eos -r 0 0 file drop "${FILE_PATH}" ${DISK_FSID} 2>&1` + RESULT=$? 
+ if [ 0 -ne ${RESULT} ]; then + echo "${LOG_DATE} ${LOG_SCRIPT_NAME} failed to delete disk replica with fsid ${DISK_FSID} for ${FILE_PATH}: ${RESULT} ${OUTPUT}" >> ${LOG_FILE} exit 1 fi done diff --git a/objectstore/ArchiveRequest.cpp b/objectstore/ArchiveRequest.cpp index fd781dacbac99d71afdb864ec859c15b84c52da0..6f586b18a4c35b002f101a996e83fb8668736365 100644 --- a/objectstore/ArchiveRequest.cpp +++ b/objectstore/ArchiveRequest.cpp @@ -25,6 +25,7 @@ #include <algorithm> #include <google/protobuf/util/json_util.h> +#include <cmath> namespace cta { namespace objectstore { @@ -301,11 +302,12 @@ void ArchiveRequest::garbageCollect(const std::string &presumedOwner, AgentRefer std::string queueObject="Not defined yet"; anythingGarbageCollected=true; try { + utils::Timer t; // Get the queue where we should requeue the job. The queue might need to be // recreated (this will be done by helper). ArchiveQueue aq(m_objectStore); - ScopedExclusiveLock tpl; - Helpers::getLockedAndFetchedQueue<ArchiveQueue>(aq, tpl, agentReference, j->tapepool(), lc); + ScopedExclusiveLock aql; + Helpers::getLockedAndFetchedQueue<ArchiveQueue>(aq, aql, agentReference, j->tapepool(), lc); queueObject=aq.getAddressIfSet(); ArchiveRequest::JobDump jd; jd.copyNb = j->copynb(); @@ -314,14 +316,32 @@ void ArchiveRequest::garbageCollect(const std::string &presumedOwner, AgentRefer if (aq.addJobIfNecessary(jd, getAddressIfSet(), getArchiveFile().archiveFileID, getArchiveFile().fileSize, getMountPolicy(), getEntryLog().time)) aq.commit(); + auto queueUpdateTime = t.secs(utils::Timer::resetCounter); j->set_owner(aq.getAddressIfSet()); j->set_status(serializers::AJS_PendingMount); commit(); + aql.release(); + auto commitUnlockQueueTime = t.secs(utils::Timer::resetCounter); + timespec ts; + // We will sleep a bit to make sure other processes can also access the queue + // as we are very likely to be part of a tight loop. 
+ // TODO: ideally, end of session requeueing and garbage collection should be + // done in parallel. + // We sleep half the time it took to queue to give a chance to other lockers. + double secSleep, fracSecSleep; + fracSecSleep = std::modf(queueUpdateTime / 2, &secSleep); + ts.tv_sec = secSleep; + ts.tv_nsec = std::round(fracSecSleep * 1000 * 1000 * 1000); + nanosleep(&ts, nullptr); + auto sleepTime = t.secs(); log::ScopedParamContainer params(lc); params.add("jobObject", getAddressIfSet()) .add("queueObject", queueObject) .add("presumedOwner", presumedOwner) - .add("copyNb", j->copynb()); + .add("copyNb", j->copynb()) + .add("queueUpdateTime", queueUpdateTime) + .add("commitUnlockQueueTime", commitUnlockQueueTime) + .add("sleepTime", sleepTime); lc.log(log::INFO, "In ArchiveRequest::garbageCollect(): requeued job."); } catch (...) { // We could not requeue the job: fail it. @@ -461,6 +481,51 @@ const std::string& ArchiveRequest::AsyncJobOwnerUpdater::getSrcURL() { return m_srcURL; } +ArchiveRequest::AsyncJobSuccessfulUpdater * ArchiveRequest::asyncUpdateJobSuccessful(const uint16_t copyNumber ) { + std::unique_ptr<AsyncJobSuccessfulUpdater> ret(new AsyncJobSuccessfulUpdater); + // Passing a reference to the unique pointer led to strange behaviors. + auto & retRef = *ret; + ret->m_updaterCallback= + [this,copyNumber, &retRef](const std::string &in)->std::string { + // We have a locked and fetched object, so we just need to work on its representation. 
+ serializers::ObjectHeader oh; + oh.ParseFromString(in); + if (oh.type() != serializers::ObjectType::ArchiveRequest_t) { + std::stringstream err; + err << "In ArchiveRequest::asyncUpdateJobSuccessful()::lambda(): wrong object type: " << oh.type(); + throw cta::exception::Exception(err.str()); + } + serializers::ArchiveRequest payload; + payload.ParseFromString(oh.payload()); + auto * jl = payload.mutable_jobs(); + for (auto j=jl->begin(); j!=jl->end(); j++) { + if (j->copynb() == copyNumber) { + j->set_status(serializers::ArchiveJobStatus::AJS_Complete); + for (auto j2=jl->begin(); j2!=jl->end(); j2++) { + if (j2->status()!= serializers::ArchiveJobStatus::AJS_Complete && + j2->status()!= serializers::ArchiveJobStatus::AJS_Failed) { + retRef.m_isLastJob = false; + oh.set_payload(payload.SerializePartialAsString()); + return oh.SerializeAsString(); + } + } + retRef.m_isLastJob = true; + oh.set_payload(payload.SerializePartialAsString()); + throw cta::objectstore::Backend::AsyncUpdateWithDelete(oh.SerializeAsString()); + } + } + std::stringstream err; + err << "In ArchiveRequest::asyncUpdateJobSuccessful()::lambda(): copyNb not found"; + throw cta::exception::Exception(err.str()); + }; + ret->m_backendUpdater.reset(m_objectStore.asyncUpdate(getAddressIfSet(), ret->m_updaterCallback)); + return ret.release(); +} + +void ArchiveRequest::AsyncJobSuccessfulUpdater::wait() { + m_backendUpdater->wait(); +} + std::string ArchiveRequest::getJobOwner(uint16_t copyNumber) { checkPayloadReadable(); auto jl = m_payload.jobs(); diff --git a/objectstore/ArchiveRequest.hpp b/objectstore/ArchiveRequest.hpp index bf6a8f72d3ef8bd62a05b2001b0179c53f239d3d..d56d3d0998065c60773f53258c2052eb105fb9c2 100644 --- a/objectstore/ArchiveRequest.hpp +++ b/objectstore/ArchiveRequest.hpp @@ -76,6 +76,19 @@ public: // An job owner updater factory. The owner MUST be previousOwner for the update to be executed. 
CTA_GENERATE_EXCEPTION_CLASS(WrongPreviousOwner); AsyncJobOwnerUpdater * asyncUpdateJobOwner(uint16_t copyNumber, const std::string & owner, const std::string &previousOwner); + + // An asynchronous job updating class for success. + class AsyncJobSuccessfulUpdater { + friend class ArchiveRequest; + public: + void wait(); + bool m_isLastJob; + private: + std::function<std::string(const std::string &)> m_updaterCallback; + std::unique_ptr<Backend::AsyncUpdater> m_backendUpdater; + }; + AsyncJobSuccessfulUpdater * asyncUpdateJobSuccessful(uint16_t copyNumber); + // Get a job owner std::string getJobOwner(uint16_t copyNumber); // Request management ======================================================== diff --git a/objectstore/Backend.hpp b/objectstore/Backend.hpp index a20bd81e869970164a95b81f13eee4fb7b4d5107..7b9a5ca0db36fc4330d4553fa0175d10cb5cd337 100644 --- a/objectstore/Backend.hpp +++ b/objectstore/Backend.hpp @@ -109,6 +109,7 @@ public: CTA_GENERATE_EXCEPTION_CLASS(CouldNotUpdateValue); CTA_GENERATE_EXCEPTION_CLASS(CouldNotCommit); CTA_GENERATE_EXCEPTION_CLASS(CouldNotUnlock); + CTA_GENERATE_EXCEPTION_CLASS(AsyncUpdateWithDelete); /** * A base class handling asynchronous sequence of lock exclusive, fetch, call user diff --git a/objectstore/BackendRados.cpp b/objectstore/BackendRados.cpp index aa269039d3ef30ec74afad712543f3b4ca579554..18f7a3187da43f0870f2cbbbce06ae46296b4565 100644 --- a/objectstore/BackendRados.cpp +++ b/objectstore/BackendRados.cpp @@ -161,7 +161,7 @@ BackendRados::ScopedLock* BackendRados::lockExclusive(std::string name) { std::string client = createUniqueClientId(); struct timeval tv; tv.tv_usec = 0; - tv.tv_sec = 60; + tv.tv_sec = 240; int rc; std::unique_ptr<ScopedLock> ret(new ScopedLock(m_radosCtx)); // Crude backoff: we will measure the RTT of the call and backoff a faction of this amount multiplied @@ -214,7 +214,7 @@ BackendRados::ScopedLock* BackendRados::lockShared(std::string name) { std::string client = createUniqueClientId(); 
struct timeval tv; tv.tv_usec = 0; - tv.tv_sec = 60; + tv.tv_sec = 240; int rc; std::unique_ptr<ScopedLock> ret(new ScopedLock(m_radosCtx)); // Crude backoff: we will measure the RTT of the call and backoff a faction of this amount multiplied @@ -254,7 +254,7 @@ BackendRados::ScopedLock* BackendRados::lockShared(std::string name) { cta::exception::Errnum::throwOnReturnedErrno (-m_radosCtx.remove(name), std::string("In ObjectStoreRados::lockShared, failed to librados::IoCtx::remove: ") + name + "//"); - throw cta::exception::Errnum(ENOENT, std::string ("In BackendRados::lockShared(): trying to lock a non-existing object") + name); + throw cta::exception::Errnum(ENOENT, std::string ("In BackendRados::lockShared(): trying to lock a non-existing object: ") + name); } ret->set(name, client); return ret.release(); @@ -278,7 +278,7 @@ BackendRados::AsyncUpdater::AsyncUpdater(BackendRados& be, const std::string& na m_lockClient = BackendRados::createUniqueClientId(); struct timeval tv; tv.tv_usec = 0; - tv.tv_sec = 60; + tv.tv_sec = 240; int rc; // TODO: could be improved (but need aio_lock in rados, not available at the time // of writing). @@ -382,25 +382,48 @@ void BackendRados::AsyncUpdater::fetchCallback(librados::completion_t completion std::string("In In BackendRados::AsyncUpdater::fetchCallback::update_lambda(): failed to read buffer: ") + au.m_name + ": "+ ex.what()); } - // Execute the user's callback. Let exceptions fly through. User knows his own exceptions. - value=au.m_update(value); - try { - // Prepare result in buffer list. - au.m_radosBufferList.clear(); - au.m_radosBufferList.append(value); - } catch (std::exception & ex) { - throw CouldNotUpdateValue( - std::string("In In BackendRados::AsyncUpdater::fetchCallback::update_lambda(): failed to prepare write buffer(): ") + - au.m_name + ": " + ex.what()); + + bool updateWithDelete = false; + try { + // Execute the user's callback. 
+ value=au.m_update(value); + } catch (AsyncUpdateWithDelete & ex) { + updateWithDelete = true; + } catch (...) { + // Let exceptions fly through. User knows his own exceptions. + throw; } - // Launch the write - librados::AioCompletion * aioc = librados::Rados::aio_create_completion(pThis, commitCallback, nullptr); - auto rc=au.m_backend.m_radosCtx.aio_write_full(au.m_name, aioc, au.m_radosBufferList); - aioc->release(); - if (rc) { - cta::exception::Errnum errnum (-rc, - std::string("In BackendRados::AsyncUpdater::fetchCallback::update_lambda(): failed to launch aio_write_full(): ") + au.m_name); - throw Backend::CouldNotCommit(errnum.getMessageValue()); + + if(updateWithDelete) { + try { + au.m_backend.remove(au.m_name); + } catch (cta::exception::Exception &ex) { + throw CouldNotUpdateValue( + std::string("In In BackendRados::AsyncUpdater::fetchCallback::update_lambda(): failed to remove value: ") + + au.m_name + ex.what()); + } + // Done! + ANNOTATE_HAPPENS_BEFORE(&au.m_job); + au.m_job.set_value(); + } else { + try { + // Prepare result in buffer list. + au.m_radosBufferList.clear(); + au.m_radosBufferList.append(value); + } catch (std::exception & ex) { + throw CouldNotUpdateValue( + std::string("In In BackendRados::AsyncUpdater::fetchCallback::update_lambda(): failed to prepare write buffer(): ") + + au.m_name + ex.what()); + } + // Launch the write + librados::AioCompletion * aioc = librados::Rados::aio_create_completion(pThis, commitCallback, nullptr); + auto rc=au.m_backend.m_radosCtx.aio_write_full(au.m_name, aioc, au.m_radosBufferList); + aioc->release(); + if (rc) { + cta::exception::Errnum errnum (-rc, + "In BackendRados::AsyncUpdater::fetchCallback::update_lambda(): failed to launch aio_write_full()" + au.m_name); + throw Backend::CouldNotCommit(errnum.getMessageValue()); + } } } catch (...) 
{ ANNOTATE_HAPPENS_BEFORE(&au.m_job); diff --git a/objectstore/BackendVFS.cpp b/objectstore/BackendVFS.cpp index 3cb0a2c99c6d2f1b991eea3ccb24eb0167249d9b..a8680b6565c35241b3537f2c83e96641b0cdaad2 100644 --- a/objectstore/BackendVFS.cpp +++ b/objectstore/BackendVFS.cpp @@ -329,13 +329,32 @@ BackendVFS::AsyncUpdater::AsyncUpdater(BackendVFS & be, const std::string& name, ANNOTATE_HAPPENS_BEFORE(&m_job); throw Backend::CouldNotFetch(ex.getMessageValue()); } - // Let user's exceptions go through. - std::string postUpdateData=m_update(preUpdateData); - try { - m_backend.atomicOverwrite(m_name, postUpdateData); - } catch (cta::exception::Exception & ex) { - ANNOTATE_HAPPENS_BEFORE(&m_job); - throw Backend::CouldNotCommit(ex.getMessageValue()); + + std::string postUpdateData; + bool updateWithDelete = false; + try { + postUpdateData=m_update(preUpdateData); + } catch (AsyncUpdateWithDelete & ex) { + updateWithDelete = true; + } catch (...) { + // Let user's exceptions go through. + throw; + } + + if(updateWithDelete) { + try { + m_backend.remove(m_name); + } catch (cta::exception::Exception & ex) { + ANNOTATE_HAPPENS_BEFORE(&m_job); + throw Backend::CouldNotCommit(ex.getMessageValue()); + } + } else { + try { + m_backend.atomicOverwrite(m_name, postUpdateData); + } catch (cta::exception::Exception & ex) { + ANNOTATE_HAPPENS_BEFORE(&m_job); + throw Backend::CouldNotCommit(ex.getMessageValue()); + } } try { sl->release(); diff --git a/objectstore/ObjectOps.hpp b/objectstore/ObjectOps.hpp index 727b1abf1ac161cb4ed7ba51236fbfe6737ec4ea..3dbd6b3aef1c945c895cb72dc33097e622bea317 100644 --- a/objectstore/ObjectOps.hpp +++ b/objectstore/ObjectOps.hpp @@ -84,7 +84,7 @@ protected: } void checkReadable() { - if (!m_locksCount) + if (!m_locksCount && !m_noLock) throw NotLocked("In ObjectOps::checkReadable: object not locked"); } @@ -125,6 +125,12 @@ public: m_payloadInterpreted = false; } + void resetValues () { + m_existingObject = false; + m_headerInterpreted = false; + 
m_payloadInterpreted = false; + } + void setOwner(const std::string & owner) { checkHeaderWritable(); m_header.set_owner(owner); @@ -155,6 +161,7 @@ protected: bool m_existingObject; int m_locksCount; int m_locksForWriteCount; + bool m_noLock = false; }; class ScopedLock { @@ -280,6 +287,15 @@ public: // Check that the object is locked, one way or another if(!m_locksCount) throw NotLocked("In ObjectOps::fetch(): object not locked"); + fetchBottomHalf(); + } + + void fetchNoLock() { + m_noLock = true; + fetchBottomHalf(); + } + + void fetchBottomHalf() { m_existingObject = true; // Get the header from the object store getHeaderFromObjectStore(); diff --git a/objectstore/RetrieveRequest.cpp b/objectstore/RetrieveRequest.cpp index 1c9399d287d63a0505cfa3003eae8564c2e4c561..f8623f377729dd4d36c388d2b34a7a354d3ba43b 100644 --- a/objectstore/RetrieveRequest.cpp +++ b/objectstore/RetrieveRequest.cpp @@ -26,6 +26,7 @@ #include "objectstore/cta.pb.h" #include "Helpers.hpp" #include <google/protobuf/util/json_util.h> +#include <cmath> namespace cta { namespace objectstore { @@ -63,6 +64,7 @@ void RetrieveRequest::initialize() { void RetrieveRequest::garbageCollect(const std::string& presumedOwner, AgentReference & agentReference, log::LogContext & lc, cta::catalogue::Catalogue & catalogue) { checkPayloadWritable(); + utils::Timer t; // Check the request is indeed owned by the right owner. 
if (getOwner() != presumedOwner) { log::ScopedParamContainer params(lc); @@ -117,6 +119,7 @@ void RetrieveRequest::garbageCollect(const std::string& presumedOwner, AgentRefe throw exception::Exception(err.str()); } tapeFileFound:; + auto tapeSelectionTime = t.secs(utils::Timer::resetCounter); auto bestJob=m_payload.mutable_jobs()->begin(); while (bestJob!=m_payload.mutable_jobs()->end()) { if (bestJob->copynb() == bestTapeFile->copynb()) @@ -140,20 +143,39 @@ jobFound:; mp, m_payload.schedulerrequest().entrylog().time()); auto jobsSummary=rq.getJobsSummary(); rq.commit(); + auto queueUpdateTime = t.secs(utils::Timer::resetCounter); // We can now make the transition official bestJob->set_status(serializers::RetrieveJobStatus::RJS_Pending); m_payload.set_activecopynb(bestJob->copynb()); setOwner(rq.getAddressIfSet()); commit(); + Helpers::updateRetrieveQueueStatisticsCache(bestVid, jobsSummary.files, jobsSummary.bytes, jobsSummary.priority); + rql.release(); + auto commitUnlockQueueTime = t.secs(utils::Timer::resetCounter); + timespec ts; + // We will sleep a bit to make sure other processes can also access the queue + // as we are very likely to be part of a tight loop. + // TODO: ideally, end of session requeueing and garbage collection should be + // done in parallel. + // We sleep half the time it took to queue to give a chance to other lockers. 
+ double secSleep, fracSecSleep;
+ fracSecSleep = std::modf(queueUpdateTime / 2, &secSleep);
+ ts.tv_sec = secSleep;
+ ts.tv_nsec = std::round(fracSecSleep * 1000 * 1000 * 1000);
+ nanosleep(&ts, nullptr);
+ auto sleepTime = t.secs();
 {
 log::ScopedParamContainer params(lc);
 params.add("jobObject", getAddressIfSet())
 .add("queueObject", rq.getAddressIfSet())
 .add("copynb", bestTapeFile->copynb())
- .add("vid", bestTapeFile->vid());
+ .add("vid", bestTapeFile->vid())
+ .add("tapeSelectionTime", tapeSelectionTime)
+ .add("queueUpdateTime", queueUpdateTime)
+ .add("commitUnlockQueueTime", commitUnlockQueueTime)
+ .add("sleepTime", sleepTime);
 lc.log(log::INFO, "In RetrieveRequest::garbageCollect(): requeued the request.");
 }
- Helpers::updateRetrieveQueueStatisticsCache(bestVid, jobsSummary.files, jobsSummary.bytes, jobsSummary.priority);
 }
 
 //------------------------------------------------------------------------------
@@ -337,6 +359,82 @@ serializers::RetrieveJobStatus RetrieveRequest::getJobStatus(uint16_t copyNumber
 throw exception::Exception(err.str());
 }
 
+auto RetrieveRequest::asyncUpdateOwner(uint16_t copyNumber, const std::string& owner, const std::string& previousOwner)
+ -> AsyncOwnerUpdater* {
+ std::unique_ptr<AsyncOwnerUpdater> ret(new AsyncOwnerUpdater);
+ // Passing a reference to the unique pointer led to strange behaviors.
+ auto & retRef = *ret;
+ ret->m_updaterCallback=
+ [this, copyNumber, owner, previousOwner, &retRef](const std::string &in)->std::string {
+ // We have a locked and fetched object, so we just need to work on its representation.
+ serializers::ObjectHeader oh;
+ if (!oh.ParseFromString(in)) {
+ // Use the tolerant parser to assess the situation.
+ oh.ParsePartialFromString(in);
+ throw cta::exception::Exception(std::string("In RetrieveRequest::asyncUpdateJobOwner(): could not parse header: ")+
+ oh.InitializationErrorString());
+ }
+ if (oh.type() != serializers::ObjectType::RetrieveRequest_t) {
+ std::stringstream err;
+ err << "In RetrieveRequest::asyncUpdateJobOwner()::lambda(): wrong object type: " << oh.type();
+ throw cta::exception::Exception(err.str());
+ }
+ // We don't need to deserialize the payload to update owner...
+ if (oh.owner() != previousOwner)
+ throw WrongPreviousOwner("In RetrieveRequest::asyncUpdateJobOwner()::lambda(): Request not owned.");
+ oh.set_owner(owner);
+ // ... but we still need to extract information
+ serializers::RetrieveRequest payload;
+ if (!payload.ParseFromString(oh.payload())) {
+ // Use the tolerant parser to assess the situation.
+ payload.ParsePartialFromString(oh.payload());
+ throw cta::exception::Exception(std::string("In RetrieveRequest::asyncUpdateJobOwner(): could not parse payload: ")+
+ payload.InitializationErrorString());
+ }
+ // Find the copy number
+ auto jl=payload.jobs();
+ for (auto & j: jl) {
+ if (j.copynb() == copyNumber) {
+ // We also need to gather all the job content for the user to get in-memory
+ // representation.
+ // TODO this is an unfortunate duplication of the getXXX() members of ArchiveRequest.
+ // We could try and refactor this.
+ retRef.m_retieveRequest.archiveFileID = payload.archivefile().archivefileid(); + objectstore::EntryLogSerDeser el; + el.deserialize(payload.schedulerrequest().entrylog()); + retRef.m_retieveRequest.creationLog = el; + objectstore::DiskFileInfoSerDeser dfi; + dfi.deserialize(payload.schedulerrequest().diskfileinfo()); + retRef.m_retieveRequest.diskFileInfo = dfi; + retRef.m_retieveRequest.dstURL = payload.schedulerrequest().dsturl(); + retRef.m_retieveRequest.requester.name = payload.schedulerrequest().requester().name(); + retRef.m_retieveRequest.requester.group = payload.schedulerrequest().requester().group(); + objectstore::ArchiveFileSerDeser af; + af.deserialize(payload.archivefile()); + retRef.m_archiveFile = af; + oh.set_payload(payload.SerializePartialAsString()); + return oh.SerializeAsString(); + } + } + // If we do not find the copy, return not owned as well... + throw WrongPreviousOwner("In ArchiveRequest::asyncUpdateJobOwner()::lambda(): copyNb not found."); + }; + ret->m_backendUpdater.reset(m_objectStore.asyncUpdate(getAddressIfSet(), ret->m_updaterCallback)); + return ret.release(); + } + +void RetrieveRequest::AsyncOwnerUpdater::wait() { + m_backendUpdater->wait(); +} + +const common::dataStructures::ArchiveFile& RetrieveRequest::AsyncOwnerUpdater::getArchiveFile() { + return m_archiveFile; +} + +const common::dataStructures::RetrieveRequest& RetrieveRequest::AsyncOwnerUpdater::getRetrieveRequest() { + return m_retieveRequest; +} + void RetrieveRequest::setActiveCopyNumber(uint32_t activeCopyNb) { checkPayloadWritable(); m_payload.set_activecopynb(activeCopyNb); diff --git a/objectstore/RetrieveRequest.hpp b/objectstore/RetrieveRequest.hpp index f8c51e1768d3822af295bed14c6e2524a5231ea1..7a1f3f22c6f48bc834cd3ff1a5ad94c397bd05c3 100644 --- a/objectstore/RetrieveRequest.hpp +++ b/objectstore/RetrieveRequest.hpp @@ -71,6 +71,22 @@ public: // Mark all the jobs as pending deletion from NS. 
void setAllJobsPendingNSdeletion(); CTA_GENERATE_EXCEPTION_CLASS(NoSuchJob); + // An asynchronous job ownership updating class. + class AsyncOwnerUpdater { + friend class RetrieveRequest; + public: + void wait(); + const common::dataStructures::RetrieveRequest & getRetrieveRequest(); + const common::dataStructures::ArchiveFile & getArchiveFile(); + private: + std::function<std::string(const std::string &)> m_updaterCallback; + std::unique_ptr<Backend::AsyncUpdater> m_backendUpdater; + common::dataStructures::RetrieveRequest m_retieveRequest; + common::dataStructures::ArchiveFile m_archiveFile; + }; + // An owner updater factory. The owner MUST be previousOwner for the update to be executed. + CTA_GENERATE_EXCEPTION_CLASS(WrongPreviousOwner); + AsyncOwnerUpdater * asyncUpdateOwner(uint16_t copyNumber, const std::string & owner, const std::string &previousOwner); // Request management ======================================================== void setSuccessful(); void setFailed(); diff --git a/objectstore/RootEntry.cpp b/objectstore/RootEntry.cpp index 011bafbb7744b48e99c3957d90be8a84880c1b8d..bbf006e201fe7a60c2f6aeaa2c29818b18afa2a2 100644 --- a/objectstore/RootEntry.cpp +++ b/objectstore/RootEntry.cpp @@ -166,7 +166,7 @@ void RootEntry::removeArchiveQueueAndCommit(const std::string& tapePool) { } // Check the archive queue is empty if (!aq.isEmpty()) { - throw ArchivelQueueNotEmpty ("In RootEntry::removeArchiveQueueAndCommit(): trying to " + throw ArchiveQueueNotEmpty ("In RootEntry::removeArchiveQueueAndCommit(): trying to " "remove a non-empty archive queue"); } // We can delete the queue diff --git a/objectstore/RootEntry.hpp b/objectstore/RootEntry.hpp index 530471d770f5ddad750275262c3100a2a50ae23f..86deabaf82423ad04c844de00a5c3e15d0fbacfb 100644 --- a/objectstore/RootEntry.hpp +++ b/objectstore/RootEntry.hpp @@ -59,7 +59,7 @@ public: cta::catalogue::Catalogue & catalogue) override; // ArchiveQueue handling ==================================================== - 
CTA_GENERATE_EXCEPTION_CLASS(ArchivelQueueNotEmpty); + CTA_GENERATE_EXCEPTION_CLASS(ArchiveQueueNotEmpty); CTA_GENERATE_EXCEPTION_CLASS(WrongArchiveQueue); /** This function implicitly creates the archive queue structure and updates * the pointer to it. It will implicitly commit the object to the store. */ diff --git a/objectstore/cta-objectstore-unfollow-agent.cpp b/objectstore/cta-objectstore-unfollow-agent.cpp index e44665da7847d8ea3bc00245c9499c7e3f1c6221..1ab5273a891b6ccde4c1877f6c19d4296f73bcd1 100644 --- a/objectstore/cta-objectstore-unfollow-agent.cpp +++ b/objectstore/cta-objectstore-unfollow-agent.cpp @@ -46,6 +46,20 @@ int main(int argc, char ** argv) { } catch (std::bad_cast &){} std::cout /* << "Object store path: " << be->getParams()->toURL() << " agent */<< "name=" << argv[2] << std::endl; + if (!be->exists(argv[2])) { + // Agent does not exist: remove from registry. + cta::objectstore::RootEntry re (*be); + cta::objectstore::ScopedSharedLock rel(re); + re.fetch(); + cta::objectstore::AgentRegister ar(re.getAgentRegisterAddress(), *be); + rel.release(); + cta::objectstore::ScopedExclusiveLock arl(ar); + ar.fetch(); + ar.removeAgent(argv[2]); + ar.commit(); + std::cout << "De-listed a non-existing agent." << std::endl; + exit (EXIT_SUCCESS); + } cta::objectstore::Agent ag(argv[2], *be); cta::objectstore::ScopedExclusiveLock agl(ag); try { diff --git a/scheduler/ArchiveJob.cpp b/scheduler/ArchiveJob.cpp index 10d9905f99a84db148d1769d88b3603be349b83e..de37ff600db33e566f1e239887d37165f73eb88d 100644 --- a/scheduler/ArchiveJob.cpp +++ b/scheduler/ArchiveJob.cpp @@ -41,28 +41,28 @@ cta::ArchiveJob::ArchiveJob(ArchiveMount &mount, tapeFile(tapeFile) {} //------------------------------------------------------------------------------ -// complete +// asyncSetJobSucceed //------------------------------------------------------------------------------ -bool cta::ArchiveJob::complete() { - // First check that the block Id for the file has been set. 
- if (tapeFile.blockId == - std::numeric_limits<decltype(tapeFile.blockId)>::max()) - throw BlockIdNotSet("In cta::ArchiveJob::complete(): Block ID not set"); - // Also check the checksum has been set - if (archiveFile.checksumType.empty() || archiveFile.checksumValue.empty() || - tapeFile.checksumType.empty() || tapeFile.checksumValue.empty()) - throw ChecksumNotSet("In cta::ArchiveJob::complete(): checksums not set"); - // And matches - if (archiveFile.checksumType != tapeFile.checksumType || - archiveFile.checksumValue != tapeFile.checksumValue) - throw ChecksumMismatch(std::string("In cta::ArchiveJob::complete(): checksum mismatch!") - +" Archive file checksum type: "+archiveFile.checksumType - +" Archive file checksum value: "+archiveFile.checksumValue - +" Tape file checksum type: "+tapeFile.checksumType - +" Tape file checksum value: "+tapeFile.checksumValue); - // We are good to go to record the data in the persistent storage. - // Record the data in the archiveNS. The checksum will be validated if already - // present, of inserted if not. 
+void cta::ArchiveJob::asyncSetJobSucceed() { + m_dbJob->asyncSucceed(); +} + +//------------------------------------------------------------------------------ +// checkAndReportComplete +//------------------------------------------------------------------------------ +bool cta::ArchiveJob::checkAndAsyncReportComplete() { + if (m_dbJob->checkSucceed()) { + std::unique_ptr<eos::DiskReporter> reporter(m_mount.createDiskReporter(m_dbJob->archiveReportURL, m_reporterState)); + reporter->asyncReportArchiveFullyComplete(); + return true; + } + return false; +} + +//------------------------------------------------------------------------------ +// ArchiveJob::writeToCatalogue +//------------------------------------------------------------------------------ +void cta::ArchiveJob::writeToCatalogue() { catalogue::TapeFileWritten fileReport; fileReport.archiveFileId = archiveFile.archiveFileID; fileReport.blockId = tapeFile.blockId; @@ -82,18 +82,53 @@ bool cta::ArchiveJob::complete() { fileReport.tapeDrive = m_mount.getDrive(); fileReport.vid = tapeFile.vid; m_catalogue.filesWrittenToTape (std::set<catalogue::TapeFileWritten>{fileReport}); - //m_ns.addTapeFile(SecurityIdentity(UserIdentity(std::numeric_limits<uint32_t>::max(), - // std::numeric_limits<uint32_t>::max()), ""), archiveFile.fileId, nameServerTapeFile); - // We will now report the successful archival to the EOS instance. - // if TODO TODO - // We can now record the success for the job in the database. - // If this is the last job of the request, we also report the success to the client. 
- if (m_dbJob->succeed()) { - std::unique_ptr<eos::DiskReporter> reporter(m_mount.createDiskReporter(m_dbJob->archiveReportURL)); - reporter->reportArchiveFullyComplete(); - return true; - } - return false; +} +//------------------------------------------------------------------------------ +// ArchiveJob::validateAndGetTapeFileWritten +//------------------------------------------------------------------------------ +cta::catalogue::TapeFileWritten cta::ArchiveJob::validateAndGetTapeFileWritten() { + validate(); + catalogue::TapeFileWritten fileReport; + fileReport.archiveFileId = archiveFile.archiveFileID; + fileReport.blockId = tapeFile.blockId; + fileReport.checksumType = tapeFile.checksumType; + fileReport.checksumValue = tapeFile.checksumValue; + fileReport.compressedSize = tapeFile.compressedSize; + fileReport.copyNb = tapeFile.copyNb; + fileReport.diskFileId = archiveFile.diskFileId; + fileReport.diskFileUser = archiveFile.diskFileInfo.owner; + fileReport.diskFileGroup = archiveFile.diskFileInfo.group; + fileReport.diskFilePath = archiveFile.diskFileInfo.path; + fileReport.diskFileRecoveryBlob = archiveFile.diskFileInfo.recoveryBlob; + fileReport.diskInstance = archiveFile.diskInstance; + fileReport.fSeq = tapeFile.fSeq; + fileReport.size = archiveFile.fileSize; + fileReport.storageClassName = archiveFile.storageClass; + fileReport.tapeDrive = m_mount.getDrive(); + fileReport.vid = tapeFile.vid; + return fileReport; +} + +//------------------------------------------------------------------------------ +// ArchiveJob::validate +//------------------------------------------------------------------------------ +void cta::ArchiveJob::validate(){ + // First check that the block Id for the file has been set. 
+ if (tapeFile.blockId == + std::numeric_limits<decltype(tapeFile.blockId)>::max()) + throw BlockIdNotSet("In cta::ArchiveJob::validate(): Block ID not set"); + // Also check the checksum has been set + if (archiveFile.checksumType.empty() || archiveFile.checksumValue.empty() || + tapeFile.checksumType.empty() || tapeFile.checksumValue.empty()) + throw ChecksumNotSet("In cta::ArchiveJob::validate(): checksums not set"); + // And matches + if (archiveFile.checksumType != tapeFile.checksumType || + archiveFile.checksumValue != tapeFile.checksumValue) + throw ChecksumMismatch(std::string("In cta::ArchiveJob::validate(): checksum mismatch!") + +" Archive file checksum type: "+archiveFile.checksumType + +" Archive file checksum value: "+archiveFile.checksumValue + +" Tape file checksum type: "+tapeFile.checksumType + +" Tape file checksum value: "+tapeFile.checksumValue); } //------------------------------------------------------------------------------ @@ -109,3 +144,10 @@ std::string cta::ArchiveJob::reportURL() { void cta::ArchiveJob::failed(const cta::exception::Exception &ex, log::LogContext & lc) { m_dbJob->fail(lc); } + +//------------------------------------------------------------------------------ +// waitForReporting +//------------------------------------------------------------------------------ +void cta::ArchiveJob::waitForReporting() { + m_reporterState.get_future().get(); +} diff --git a/scheduler/ArchiveJob.hpp b/scheduler/ArchiveJob.hpp index c042fe42f92f488d5949a624f0148541d65aac49..d94afd831da7e09e2ab34307928e344036c782fa 100644 --- a/scheduler/ArchiveJob.hpp +++ b/scheduler/ArchiveJob.hpp @@ -25,6 +25,7 @@ #include <stdint.h> #include <string> +#include <future> namespace cta { @@ -68,11 +69,37 @@ public: CTA_GENERATE_EXCEPTION_CLASS(BlockIdNotSet); CTA_GENERATE_EXCEPTION_CLASS(ChecksumNotSet); CTA_GENERATE_EXCEPTION_CLASS(ChecksumMismatch); + + /** + * Indicates that the job was successful and updates the backend store + * asynchronously. 
+ */ + virtual void asyncSetJobSucceed(); + + /** + * Wait if the job was updated in the backend store asynchronously. + * @return true if the archive was also sent to client asynchronously. + */ + virtual bool checkAndAsyncReportComplete(); + + /** + * Validate that archiveFile and tapeFile fields are set correctly for archive + * request. + * Throw appropriate exception if there is any problem. + */ + virtual void validate(); + + /** + * Update the catalog with the archive request. + */ + virtual void writeToCatalogue(); + /** - * Indicates that the job was successful and updates the backend store - * @return true if the archive was also reported to client. + * Validate that archiveFile and tapeFile fields are set correctly for archive + * request. + * @return The tapeFileWritten event for the catalog update. */ - virtual bool complete(); + virtual catalogue::TapeFileWritten validateAndGetTapeFileWritten(); /** * Triggers a scheduler update following the failure of the job. @@ -99,13 +126,19 @@ private: * The mount that generated this job */ ArchiveMount &m_mount; - + /** * Reference to the name server */ catalogue::Catalogue &m_catalogue; -public: + /** + * State for the asynchronous report to the client. + */ + std::promise<void> m_reporterState; + +public: + CTA_GENERATE_EXCEPTION_CLASS(NotImplemented); /** @@ -122,6 +155,11 @@ public: * The file archive result for the NS */ common::dataStructures::TapeFile tapeFile; + + /** + * Wait for the reporterState is set by the reporting thread. 
+ */ + virtual void waitForReporting(); }; // class ArchiveJob diff --git a/scheduler/ArchiveMount.cpp b/scheduler/ArchiveMount.cpp index e3e429ec2ba25d34750fa117fb8f71456e63d19a..32e7a9dfe9ad33306a3330a1f87d7eecdb1b6311 100644 --- a/scheduler/ArchiveMount.cpp +++ b/scheduler/ArchiveMount.cpp @@ -77,8 +77,8 @@ uint32_t cta::ArchiveMount::getNbFiles() const { //------------------------------------------------------------------------------ // createDiskReporter //------------------------------------------------------------------------------ -cta::eos::DiskReporter* cta::ArchiveMount::createDiskReporter(std::string& URL) { - return m_reporterFactory.createDiskReporter(URL); +cta::eos::DiskReporter* cta::ArchiveMount::createDiskReporter(std::string& URL, std::promise<void> &reporterState) { + return m_reporterFactory.createDiskReporter(URL, reporterState); } //------------------------------------------------------------------------------ @@ -92,6 +92,13 @@ std::string cta::ArchiveMount::getMountTransactionId() const { return id.str(); } +//------------------------------------------------------------------------------ +// updateCatalogueWithTapeFilesWritten +//------------------------------------------------------------------------------ +void cta::ArchiveMount::updateCatalogueWithTapeFilesWritten(const std::set<cta::catalogue::TapeFileWritten> &tapeFilesWritten) { + m_catalogue.filesWrittenToTape(tapeFilesWritten); +} + //------------------------------------------------------------------------------ // getNextJobBatch //------------------------------------------------------------------------------ diff --git a/scheduler/ArchiveMount.hpp b/scheduler/ArchiveMount.hpp index 9f9bde38c29979f185421b16a27615aeb6f0d4a3..f6584249d153294ded5d2d712535c731167071f0 100644 --- a/scheduler/ArchiveMount.hpp +++ b/scheduler/ArchiveMount.hpp @@ -139,9 +139,17 @@ namespace cta { /** * Creates a disk reporter for the ArchiveJob (this is a wrapper). 
* @param URL: report address - * @return poitner to the reporter created. + * @param reporterState void promise to be set when the report is done asynchronously. + * @return pointer to the reporter created. */ - eos::DiskReporter * createDiskReporter(std::string & URL); + eos::DiskReporter * createDiskReporter(std::string & URL, std::promise<void> &reporterState); + + /** + * Update the catalog with a set of TapeFileWritten events. + * + * @param tapeFilesWritten The set of report events for the catalog update. + */ + void updateCatalogueWithTapeFilesWritten(const std::set<cta::catalogue::TapeFileWritten> &tapeFilesWritten); /** * Destructor. @@ -159,7 +167,7 @@ namespace cta { * A reference to the file catalogue. */ catalogue::Catalogue & m_catalogue; - + /** * Internal tracking of the session completion */ diff --git a/scheduler/OStoreDB/OStoreDB.cpp b/scheduler/OStoreDB/OStoreDB.cpp index 3ccb490701754f06c83b2d63e0bf60a7ffd315f4..4437c1427f1dae3ff6d55f60ac97d8da11bb3f1a 100644 --- a/scheduler/OStoreDB/OStoreDB.cpp +++ b/scheduler/OStoreDB/OStoreDB.cpp @@ -84,14 +84,31 @@ void OStoreDB::ping() { //------------------------------------------------------------------------------ // OStoreDB::fetchMountInfo() //------------------------------------------------------------------------------ -void OStoreDB::fetchMountInfo(SchedulerDatabase::TapeMountDecisionInfo& tmdi, RootEntry& re) { +void OStoreDB::fetchMountInfo(SchedulerDatabase::TapeMountDecisionInfo& tmdi, RootEntry& re, + FetchFlavour fetchFlavour) { // Walk the archive queues for statistics for (auto & aqp: re.dumpArchiveQueues()) { objectstore::ArchiveQueue aqueue(aqp.address, m_objectStore); // debug utility variable std::string __attribute__((__unused__)) poolName = aqp.tapePool; - objectstore::ScopedSharedLock aqlock(aqueue); - aqueue.fetch(); + objectstore::ScopedSharedLock aqlock; + try { + if (fetchFlavour == FetchFlavour::lock) { + aqlock.lock(aqueue); + aqueue.fetch(); + } else { + aqueue.fetchNoLock(); 
+ } + } catch (cta::exception::Exception &ex) { + log::LogContext lc(m_logger); + log::ScopedParamContainer params (lc); + params.add("queueObject", aqp.address) + .add("tapepool", aqp.tapePool) + .add("exceptionMessage", ex.getMessageValue()) + .add("fetchFlavour", (fetchFlavour == FetchFlavour::lock)?"lock":"noLock"); + lc.log(log::WARNING, "In OStoreDB::fetchMountInfo(): failed to lock/fetch an archive queue. Skipping it."); + continue; + } // If there are files queued, we create an entry for this tape pool in the // mount candidates list. if (aqueue.getJobsSummary().files) { @@ -115,8 +132,19 @@ void OStoreDB::fetchMountInfo(SchedulerDatabase::TapeMountDecisionInfo& tmdi, Ro RetrieveQueue rqueue(rqp.address, m_objectStore); // debug utility variable std::string __attribute__((__unused__)) vid = rqp.vid; - ScopedSharedLock rqlock(rqueue); - rqueue.fetch(); + ScopedSharedLock rqlock; + try { + rqlock.lock(rqueue); + rqueue.fetch(); + } catch (cta::exception::Exception &ex) { + log::LogContext lc(m_logger); + log::ScopedParamContainer params (lc); + params.add("queueObject", rqp.address) + .add("vid", rqp.vid) + .add("exceptionMessage", ex.getMessageValue()); + lc.log(log::WARNING, "In OStoreDB::fetchMountInfo(): failed to lock/fetch a retrieve queue. Skipping it."); + continue; + } // If there are files queued, we create an entry for this retrieve queue in the // mount candidates list. 
if (rqueue.getJobsSummary().files) { @@ -264,9 +292,9 @@ std::unique_ptr<SchedulerDatabase::TapeMountDecisionInfo> OStoreDB::getMountInfo // Get all the tape pools and tapes with queues (potential mounts) objectstore::RootEntry re(m_objectStore); objectstore::ScopedSharedLock rel(re); - re.fetch(); + re.fetchNoLock(); TapeMountDecisionInfoNoLock & tmdi=*privateRet; - fetchMountInfo(tmdi, re); + fetchMountInfo(tmdi, re, FetchFlavour::noLock); std::unique_ptr<SchedulerDatabase::TapeMountDecisionInfo> ret(std::move(privateRet)); return ret; } @@ -939,12 +967,10 @@ std::map<std::string, std::list<common::dataStructures::RetrieveJob> > OStoreDB: //------------------------------------------------------------------------------ std::list<cta::common::dataStructures::DriveState> OStoreDB::getDriveStates() const { RootEntry re(m_objectStore); - ScopedSharedLock rel(re); - re.fetch(); + re.fetchNoLock(); auto driveRegisterAddress = re.getDriveRegisterAddress(); objectstore::DriveRegister dr(driveRegisterAddress, m_objectStore); - objectstore::ScopedExclusiveLock drl(dr); - dr.fetch(); + dr.fetchNoLock(); return dr.getAllDrivesState(); } @@ -1478,7 +1504,7 @@ std::unique_ptr<SchedulerDatabase::RetrieveMount> // latest known state of the drive (and its absence of updating if needed) // Prepare the return value std::unique_ptr<OStoreDB::RetrieveMount> privateRet( - new OStoreDB::RetrieveMount(m_objectStore, m_agentReference, m_catalogue)); + new OStoreDB::RetrieveMount(m_objectStore, m_agentReference, m_catalogue, m_logger)); auto &rm = *privateRet; // Check we hold the scheduling lock if (!m_lockTaken) @@ -1560,13 +1586,13 @@ const SchedulerDatabase::ArchiveMount::MountInfo& OStoreDB::ArchiveMount::getMou std::list<std::unique_ptr<SchedulerDatabase::ArchiveJob> > OStoreDB::ArchiveMount::getNextJobBatch(uint64_t filesRequested, uint64_t bytesRequested, log::LogContext& logContext) { // Find the next files to archive - // Get the archive queue - objectstore::RootEntry 
re(m_objectStore); - objectstore::ScopedSharedLock rel(re); - re.fetch(); // First, check we should not forcibly go down. In such an occasion, we just find noting to do. // Get drive register { + // Get the archive queue + objectstore::RootEntry re(m_objectStore); + objectstore::ScopedSharedLock rel(re); + re.fetch(); objectstore::DriveRegister dr(re.getDriveRegisterAddress(), m_objectStore); ScopedSharedLock drl(dr); dr.fetch(); @@ -1576,62 +1602,69 @@ std::list<std::unique_ptr<SchedulerDatabase::ArchiveJob> > OStoreDB::ArchiveMoun return std::list<std::unique_ptr<SchedulerDatabase::ArchiveJob> >(); } } - auto aql = re.dumpArchiveQueues(); - rel.release(); - std::string aqAddress; - for (auto & aqp : aql) { + // Now, we should repeatedly fetch jobs from the queue until we fulfilled the request or there is nothing to get form the + // queue anymore. + // Requests are left as-is on errors. We will keep a list of them to avoid re-accessing them in the same batch. + std::set<std::string> archiveRequestsToSkip; + // Prepare the returned jobs that we might accumulate in several rounds. + std::list<std::unique_ptr<OStoreDB::ArchiveJob>> privateRet; + uint64_t currentBytes=0; + uint64_t currentFiles=0; + size_t iterationCount=0; + while (true) { + iterationCount++; + uint64_t beforeBytes=currentBytes; + uint64_t beforeFiles=currentFiles; + // Try and get access to a queue. + objectstore::RootEntry re(m_objectStore); + objectstore::ScopedSharedLock rel(re); + re.fetch(); + std::string aqAddress; + auto aql = re.dumpArchiveQueues(); + for (auto & aqp : aql) { if (aqp.tapePool == mountInfo.tapePool) aqAddress = aqp.address; - } - // The archive queue is gone, there is no more job - if (!aqAddress.size()) - return std::list<std::unique_ptr<SchedulerDatabase::ArchiveJob> >(); - // Try and open the archive queue. It could be gone by now. - try { + } + if (!aqAddress.size()) break; + // try and lock the archive queue. Any failure from here on means the end of the getting jobs. 
objectstore::ArchiveQueue aq(aqAddress, m_objectStore); objectstore::ScopedExclusiveLock aqlock; try { - aqlock.lock(aq); - aq.fetch(); - } catch (cta::exception::Exception & ex) { - // The queue is now absent. We can remove its reference in the root entry. - // A new queue could have been added in the mean time, and be non-empty. - // We will then fail to remove from the RootEntry (non-fatal). - // TODO: We still conclude that the queue is empty on this unlikely event. - // (cont'd): A better approach would be to restart the process of this function - // from scratch. - rel.lock(re); - re.fetch(); try { - re.removeArchiveQueueAndCommit(mountInfo.tapePool); - log::ScopedParamContainer params(logContext); - params.add("tapepool", mountInfo.tapePool) - .add("queueObject", aq.getAddressIfSet()); - logContext.log(log::INFO, "In ArchiveMount::getNextJobBatch(): de-referenced missing queue from root entry"); - } catch (RootEntry::ArchivelQueueNotEmpty & ex) { - // TODO: improve: if we fail here we could retry to fetch a job. + aqlock.lock(aq); + rel.release(); + aq.fetch(); + } catch (cta::exception::Exception & ex) { + // The queue is now absent. We can remove its reference in the root entry. + // A new queue could have been added in the mean time, and be non-empty. + // We will then fail to remove from the RootEntry (non-fatal). + if (rel.isLocked()) rel.release(); + ScopedExclusiveLock rexl(re); + re.fetch(); + try { + re.removeArchiveQueueAndCommit(mountInfo.tapePool); + log::ScopedParamContainer params(logContext); + params.add("tapepool", mountInfo.tapePool) + .add("queueObject", aq.getAddressIfSet()); + logContext.log(log::INFO, "In ArchiveMount::getNextJobBatch(): de-referenced missing queue from root entry"); + } catch (RootEntry::ArchiveQueueNotEmpty & ex) { + // TODO: improve: if we fail here we could retry to fetch a job. 
+ log::ScopedParamContainer params(logContext); + params.add("tapepool", mountInfo.tapePool) + .add("queueObject", aq.getAddressIfSet()) + .add("Message", ex.getMessageValue()); + logContext.log(log::INFO, "In ArchiveMount::getNextJobBatch(): could not de-referenced missing queue from root entry"); + } + continue; + } + // We now have the queue. + { log::ScopedParamContainer params(logContext); params.add("tapepool", mountInfo.tapePool) .add("queueObject", aq.getAddressIfSet()) - .add("Message", ex.getMessageValue()); - logContext.log(log::INFO, "In ArchiveMount::getNextJobBatch(): could not de-referenced missing queue from root entry"); + .add("queueSize", aq.dumpJobs().size()); + logContext.log(log::INFO, "In ArchiveMount::getNextJobBatch(): archive queue found."); } - return std::list<std::unique_ptr<SchedulerDatabase::ArchiveJob> >(); - } - // Pop jobs by batch until we find a sufficient number actually belonging to the queue. - // Any job not really belonging is an uncommitted pop, which we will - // re-do here. - std::list<std::unique_ptr<OStoreDB::ArchiveJob>> privateRet; - uint64_t currentBytes=0; - uint64_t currentFiles=0; - { - log::ScopedParamContainer params(logContext); - params.add("tapepool", mountInfo.tapePool) - .add("queueObject", aq.getAddressIfSet()) - .add("queueSize", aq.dumpJobs().size()); - logContext.log(log::INFO, "In ArchiveMount::getNextJobBatch(): archive queue found."); - } - while (aq.dumpJobs().size()) { // We should build the list of jobs we intend to grab. We will attempt to // dequeue them in one go, updating jobs in parallel. If some jobs turn out // to not be there really, we will have to do several passes. @@ -1639,14 +1672,18 @@ std::list<std::unique_ptr<SchedulerDatabase::ArchiveJob> > OStoreDB::ArchiveMoun auto candidateDumps=aq.dumpJobs(); std::list<std::unique_ptr<OStoreDB::ArchiveJob>> candidateJobs; // If we fail to find jobs in one round, we will exit. 
- size_t jobsInThisRound=0; while (candidateDumps.size() && currentBytes < bytesRequested && currentFiles < filesRequested) { auto job=candidateDumps.front(); candidateDumps.pop_front(); - currentFiles++; - currentBytes+=job.size; - candidateJobs.emplace_back(new OStoreDB::ArchiveJob(job.address, m_objectStore, m_catalogue, m_logger, m_agentReference, *this)); - candidateJobs.back()->tapeFile.copyNb = job.copyNb; + // If we saw an archive request we could not pop nor cleanup (really bad case), we + // will disregard it for the rest of this getNextJobBatch call. We will re-consider + // in the next call. + if (!archiveRequestsToSkip.count(job.address)) { + currentFiles++; + currentBytes+=job.size; + candidateJobs.emplace_back(new OStoreDB::ArchiveJob(job.address, m_objectStore, m_catalogue, m_logger, m_agentReference, *this)); + candidateJobs.back()->tapeFile.copyNb = job.copyNb; + } } { log::ScopedParamContainer params(logContext); @@ -1670,7 +1707,7 @@ std::list<std::unique_ptr<SchedulerDatabase::ArchiveJob> > OStoreDB::ArchiveMoun std::list<std::unique_ptr<objectstore::ArchiveRequest::AsyncJobOwnerUpdater>> jobUpdates; for (const auto &j: candidateJobs) jobUpdates.emplace_back( j->m_archiveRequest.asyncUpdateJobOwner(j->tapeFile.copyNb, m_agentReference.getAgentAddress(), aqAddress)); - // Now run through the results of the asynchronous updates. Non-sucess results come in the form of exceptions. + // Now run through the results of the asynchronous updates. Non-success results come in the form of exceptions. std::list<std::string> jobsToForget; // The jobs either absent or not owned, for which we should just remove references (agent). std::list<std::string> jobsToDequeue; // The jobs that should not be queued anymore. All of them indeed (invalid or successfully poped). std::list<std::unique_ptr<OStoreDB::ArchiveJob>> validatedJobs; // The jobs we successfully validated. 
@@ -1696,11 +1733,10 @@ std::list<std::unique_ptr<SchedulerDatabase::ArchiveJob> > OStoreDB::ArchiveMoun log::ScopedParamContainer params(logContext); params.add("tapepool", mountInfo.tapePool) .add("queueObject", aq.getAddressIfSet()) - .add("archiveRequest", (*j)->m_archiveRequest.getAddressIfSet()) + .add("requestObject", (*j)->m_archiveRequest.getAddressIfSet()) .add("fileId", (*j)->archiveFile.archiveFileID); logContext.log(log::INFO, "In ArchiveMount::getNextJobBatch(): popped one job"); validatedJobs.emplace_back(std::move(*j)); - jobsInThisRound++; } catch (cta::exception::Exception & e) { std::string debugType=typeid(e).name(); if (typeid(e) == typeid(Backend::NoSuchObject) || @@ -1712,8 +1748,28 @@ std::list<std::unique_ptr<SchedulerDatabase::ArchiveJob> > OStoreDB::ArchiveMoun log::ScopedParamContainer params(logContext); params.add("tapepool", mountInfo.tapePool) .add("queueObject", aq.getAddressIfSet()) - .add("archiveRequest", (*j)->m_archiveRequest.getAddressIfSet()); + .add("requestObject", (*j)->m_archiveRequest.getAddressIfSet()); logContext.log(log::WARNING, "In ArchiveMount::getNextJobBatch(): skipped job not owned or not present."); + } else if (typeid(e) == typeid(Backend::CouldNotUnlock)) { + // We failed to unlock the object. The request was successfully updated, so we do own it. This is a non-fatal + // situation, so we just issue a warning. Removing the request from our agent ownership would + // orphan it. 
+ log::ScopedParamContainer params(logContext); + int demangleStatus; + char * exceptionTypeStr = abi::__cxa_demangle(typeid(e).name(), nullptr, nullptr, &demangleStatus); + params.add("tapepool", mountInfo.tapePool) + .add("queueObject", aq.getAddressIfSet()) + .add("requestObject", (*j)->m_archiveRequest.getAddressIfSet()); + if (!demangleStatus) { + params.add("exceptionType", exceptionTypeStr); + } else { + params.add("exceptionType", typeid(e).name()); + } + free(exceptionTypeStr); + exceptionTypeStr = nullptr; + params.add("message", e.getMessageValue()); + logContext.log(log::WARNING, "In ArchiveMount::getNextJobBatch(): Failed to unlock the request (lock expiration). Request remains selected."); + validatedJobs.emplace_back(std::move(*j)); } else { // This is not a success, yet we could not confirm the job status due to an unexpected error. // We leave the queue as is. We forget about owning this job. This is an error. @@ -1722,7 +1778,7 @@ std::list<std::unique_ptr<SchedulerDatabase::ArchiveJob> > OStoreDB::ArchiveMoun char * exceptionTypeStr = abi::__cxa_demangle(typeid(e).name(), nullptr, nullptr, &demangleStatus); params.add("tapepool", mountInfo.tapePool) .add("queueObject", aq.getAddressIfSet()) - .add("archiveRequest", (*j)->m_archiveRequest.getAddressIfSet()); + .add("requestObject", (*j)->m_archiveRequest.getAddressIfSet()); if (!demangleStatus) { params.add("exceptionType", exceptionTypeStr); } else { @@ -1733,6 +1789,7 @@ std::list<std::unique_ptr<SchedulerDatabase::ArchiveJob> > OStoreDB::ArchiveMoun params.add("message", e.getMessageValue()); logContext.log(log::ERR, "In ArchiveMount::getNextJobBatch(): unexpected error. Leaving the job queued."); jobsToForget.emplace_back((*j)->m_archiveRequest.getAddressIfSet()); + archiveRequestsToSkip.insert((*j)->m_archiveRequest.getAddressIfSet()); } // This job is not for us. 
jobsToForget.emplace_back((*j)->m_archiveRequest.getAddressIfSet()); @@ -1755,60 +1812,83 @@ std::list<std::unique_ptr<SchedulerDatabase::ArchiveJob> > OStoreDB::ArchiveMoun privateRet.emplace_back(std::move(*vj)); vj=validatedJobs.erase(vj); } - // We could be done now. - if (currentBytes >= bytesRequested || currentFiles >= filesRequested) - break; - // If this round was not fruitful, just break. - if (!jobsInThisRound) - break; - } - // We either ran out of jobs or fulfilled the requirements. Time to build up the reply. - // Before this, we can release the queue and delete it if we emptied it. - auto remainingJobs=aq.dumpJobs().size(); - aqlock.release(); - // If the queue is empty, we can get rid of it. - if (!remainingJobs) { - try { - // The queue should be removed as it is empty. - ScopedExclusiveLock reel(re); - re.fetch(); - re.removeArchiveQueueAndCommit(mountInfo.tapePool); - log::ScopedParamContainer params(logContext); - params.add("tapepool", mountInfo.tapePool) - .add("queueObject", aq.getAddressIfSet()); - logContext.log(log::INFO, "In ArchiveMount::getNextJobBatch(): deleted empty queue"); - } catch (cta::exception::Exception &ex) { + // Before going for another round, we can release the queue and delete it if we emptied it. + auto remainingJobs=aq.dumpJobs().size(); + aqlock.release(); + // If the queue is empty, we can get rid of it. + if (!remainingJobs) { + try { + // The queue should be removed as it is empty. 
+ ScopedExclusiveLock rexl(re); + re.fetch(); + re.removeArchiveQueueAndCommit(mountInfo.tapePool); + log::ScopedParamContainer params(logContext); + params.add("tapepool", mountInfo.tapePool) + .add("queueObject", aq.getAddressIfSet()); + logContext.log(log::INFO, "In ArchiveMount::getNextJobBatch(): deleted empty queue"); + } catch (cta::exception::Exception &ex) { + log::ScopedParamContainer params(logContext); + params.add("tapepool", mountInfo.tapePool) + .add("queueObject", aq.getAddressIfSet()) + .add("Message", ex.getMessageValue()); + logContext.log(log::INFO, "In ArchiveMount::getNextJobBatch(): could not delete a presumably empty queue"); + } + } + // We can now summarize this round + { log::ScopedParamContainer params(logContext); params.add("tapepool", mountInfo.tapePool) .add("queueObject", aq.getAddressIfSet()) - .add("Message", ex.getMessageValue()); - logContext.log(log::INFO, "In ArchiveMount::getNextJobBatch(): could not delete a presumably empty queue"); + .add("filesAdded", currentFiles - beforeFiles) + .add("bytesAdded", currentBytes - beforeBytes) + .add("filesBefore", beforeFiles) + .add("bytesBefore", beforeBytes) + .add("filesAfter", currentFiles) + .add("bytesAfter", currentBytes) + .add("iterationCount", iterationCount); + logContext.log(log::INFO, "In ArchiveMount::getNextJobBatch(): did one round of jobs retrieval."); } + // We could be done now. + if (currentBytes >= bytesRequested || currentFiles >= filesRequested) + break; + // If we had exhausted the queue while selecting the jobs, we stop here, else we can go for another + // round. + if (!candidateDumps.size()) + break; + } catch (cta::exception::Exception & ex) { + log::ScopedParamContainer params (logContext); + params.add("exceptionMessage", ex.getMessageValue()); + logContext.log(log::ERR, "In OStoreDB::ArchiveMount::getNextJobBatch(): error (CTA exception) getting more jobs. 
Backtrace follows."); + logContext.logBacktrace(log::ERR, ex.backtrace()); + break; + } catch (std::exception & e) { + log::ScopedParamContainer params (logContext); + params.add("exceptionWhat", e.what()); + logContext.log(log::ERR, "In OStoreDB::ArchiveMount::getNextJobBatch(): error (std exception) getting more jobs."); + break; + } catch (...) { + logContext.log(log::ERR, "In OStoreDB::ArchiveMount::getNextJobBatch(): error (unknown exception) getting more jobs."); + break; } - // Log the outcome. - uint64_t nFiles=privateRet.size(); - uint64_t nBytes=0; - for (auto & j: privateRet) { - nBytes+=j->archiveFile.fileSize; - } - { - log::ScopedParamContainer params(logContext); - params.add("tapepool", mountInfo.tapePool) - .add("files", nFiles) - .add("bytes", nBytes); - logContext.log(log::INFO, "In ArchiveMount::getNextJobBatch(): jobs retrieval complete."); - } - // We can construct the return value. - std::list<std::unique_ptr<SchedulerDatabase::ArchiveJob> > ret; - for (auto & j: privateRet) ret.emplace_back(std::move(j)); - return ret; - } catch (cta::exception::Exception & ex) { + } + // We either ran out of jobs or fulfilled the requirements. Time to build up the reply. + // Log the outcome. + uint64_t nFiles=privateRet.size(); + uint64_t nBytes=0; + for (auto & j: privateRet) { + nBytes+=j->archiveFile.fileSize; + } + { log::ScopedParamContainer params(logContext); params.add("tapepool", mountInfo.tapePool) - .add("message", ex.getMessageValue()); - logContext.log(log::ERR, "In ArchiveMount::getNextJobBatch(): got exception. Returning empty"); - return std::list<std::unique_ptr<SchedulerDatabase::ArchiveJob> >(); + .add("files", nFiles) + .add("bytes", nBytes); + logContext.log(log::INFO, "In ArchiveMount::getNextJobBatch(): jobs retrieval complete."); } + // We can construct the return value. 
+ std::list<std::unique_ptr<SchedulerDatabase::ArchiveJob> > ret; + for (auto & j: privateRet) ret.emplace_back(std::move(j)); + return ret; } //------------------------------------------------------------------------------ @@ -1850,8 +1930,8 @@ OStoreDB::ArchiveJob::ArchiveJob(const std::string& jobAddress, //------------------------------------------------------------------------------ // OStoreDB::RetrieveMount::RetrieveMount() //------------------------------------------------------------------------------ -OStoreDB::RetrieveMount::RetrieveMount(objectstore::Backend& os, objectstore::AgentReference& ar, catalogue::Catalogue & c): - m_objectStore(os), m_agentReference(ar), m_catalogue(c) { } +OStoreDB::RetrieveMount::RetrieveMount(objectstore::Backend& os, objectstore::AgentReference& ar, catalogue::Catalogue & c, log::Logger & l): + m_objectStore(os), m_catalogue(c), m_logger(l), m_agentReference(ar) {} //------------------------------------------------------------------------------ // OStoreDB::RetrieveMount::getMountInfo() @@ -1861,116 +1941,308 @@ const OStoreDB::RetrieveMount::MountInfo& OStoreDB::RetrieveMount::getMountInfo( } //------------------------------------------------------------------------------ -// OStoreDB::RetrieveMount::getNextJob() +// OStoreDB::RetrieveMount::getNextJobBatch() //------------------------------------------------------------------------------ -auto OStoreDB::RetrieveMount::getNextJob(log::LogContext & logContext) -> std::unique_ptr<SchedulerDatabase::RetrieveJob> { - // Find the next file to retrieve - // Get the tape pool and then tape - objectstore::RootEntry re(m_objectStore); - objectstore::ScopedSharedLock rel(re); - re.fetch(); +std::list<std::unique_ptr<SchedulerDatabase::RetrieveJob> > OStoreDB::RetrieveMount::getNextJobBatch(uint64_t filesRequested, + uint64_t bytesRequested, log::LogContext& logContext) { + // Find the next files to retrieve // First, check we should not forcibly go down. 
In such an occasion, we just find noting to do. // Get drive register { + // Get the archive queue + objectstore::RootEntry re(m_objectStore); + objectstore::ScopedSharedLock rel(re); + re.fetch(); objectstore::DriveRegister dr(re.getDriveRegisterAddress(), m_objectStore); ScopedSharedLock drl(dr); dr.fetch(); auto drs = dr.getDriveState(mountInfo.drive); if (!drs.desiredDriveState.up && drs.desiredDriveState.forceDown) { - logContext.log(log::INFO, "In OStoreDB::RetrieveMount::getNextJob(): returning no job as we are forcibly going down."); - return nullptr; + logContext.log(log::INFO, "In OStoreDB::RetrieveMount::getNextJobBatch(): returning no job as we are forcibly going down."); + return std::list<std::unique_ptr<SchedulerDatabase::RetrieveJob> >(); } } - auto rql = re.dumpRetrieveQueues(); - rel.release(); - std::string rqAddress; - for (auto & rqp: rql) { + // Now, we should repeatedly fetch jobs from the queue until we fulfilled the request or there is nothing to get form the + // queue anymore. + // Requests are left as-is on errors. We will keep a list of them to avoid re-accessing them in the same batch. + std::set<std::string> retrieveRequestsToSkip; + // Prepare the returned jobs that we might accumulate in several rounds. + std::list<std::unique_ptr<OStoreDB::RetrieveJob>> privateRet; + uint64_t currentBytes=0; + uint64_t currentFiles=0; + size_t iterationCount=0; + while (true) { + iterationCount++; + uint64_t beforeBytes=currentBytes; + uint64_t beforeFiles=currentFiles; + // Try and get access to a queue. + objectstore::RootEntry re(m_objectStore); + objectstore::ScopedSharedLock rel(re); + re.fetch(); + std::string rqAddress; + auto rql = re.dumpRetrieveQueues(); + for (auto & rqp : rql) { if (rqp.vid == mountInfo.vid) rqAddress = rqp.address; - } - // The retrieve queue is gone. There is no more job. - if (!rqAddress.size()) - return nullptr; - // Try and open the retrieve queue. It could be gone by now. 
- try { + } + if (!rqAddress.size()) break; + // try and lock the archive queue. Any failure from here on means the end of the getting jobs. objectstore::RetrieveQueue rq(rqAddress, m_objectStore); - objectstore::ScopedExclusiveLock rqlock; + objectstore::ScopedExclusiveLock rqLock; try { - rqlock.lock(rq); - rq.fetch(); - } catch (cta::exception::Exception & ex) { - // The queue is now absent. We can remove its reference in the root entry. - // A new queue could have been added in the mean time, and be non-empty. - // We will then fail to remove from the RootEntry (non-fatal). - // TODO: We still conclude that the queue is empty on this unlikely event. - // (cont'd): A better approach would be to retart the process of this function - // from scratch. - rel.lock(re); - re.fetch(); try { - re.removeRetrieveQueueAndCommit(mountInfo.vid); - } catch (RootEntry::RetrieveQueueNotEmpty & ex) { - // TODO: improve: if we fail here we could retry to fetch a job. - return nullptr; - } - } - // Pop jobs until we find one actually belonging to the queue. - // Any job not really belonging is an uncommitted pop, which we will - // re-do here. - while (rq.dumpJobs().size()) { - // First take a lock on and download the job - // If the request is not around anymore, we will just move the the next - // Prepare the return value - auto job=rq.dumpJobs().front(); - std::unique_ptr<OStoreDB::RetrieveJob> privateRet(new OStoreDB::RetrieveJob( - job.address, m_objectStore, m_catalogue, m_agentReference, *this)); - privateRet->selectedCopyNb = job.copyNb; - objectstore::ScopedExclusiveLock rrl; - try { - rrl.lock(privateRet->m_retrieveRequest); - privateRet->m_retrieveRequest.fetch(); - if(privateRet->m_retrieveRequest.getOwner() != rq.getAddressIfSet()) { - rq.removeJob(privateRet->m_retrieveRequest.getAddressIfSet()); - continue; + rqLock.lock(rq); + rel.release(); + rq.fetch(); + } catch (cta::exception::Exception & ex) { + // The queue is now absent. 
We can remove its reference in the root entry. + // A new queue could have been added in the mean time, and be non-empty. + // We will then fail to remove from the RootEntry (non-fatal). + if (rel.isLocked()) rel.release(); + ScopedExclusiveLock rexl(re); + re.fetch(); + try { + re.removeRetrieveQueueAndCommit(mountInfo.vid); + log::ScopedParamContainer params(logContext); + params.add("vid", mountInfo.vid) + .add("queueObject", rq.getAddressIfSet()); + logContext.log(log::INFO, "In RetrieveMount::getNextJobBatch(): de-referenced missing queue from root entry"); + } catch (RootEntry::RetrieveQueueNotEmpty & ex) { + // TODO: improve: if we fail here we could retry to fetch a job. + log::ScopedParamContainer params(logContext); + params.add("vid", mountInfo.vid) + .add("queueObject", rq.getAddressIfSet()) + .add("Message", ex.getMessageValue()); + logContext.log(log::INFO, "In RetrieveMount::getNextJobBatch(): could not de-referenced missing queue from root entry"); } - } catch (cta::exception::Exception &) { - // we failed to access the object. It might be missing. - // Just pop this job from the queue and move to the next. - rq.removeJob(privateRet->m_retrieveRequest.getAddressIfSet()); - // Commit in case we do not pass by again. - rq.commit(); continue; } - // Take ownership of the job - // Add to ownership - m_agentReference.addToOwnership(privateRet->m_retrieveRequest.getAddressIfSet(), m_objectStore); - // Make the ownership official - privateRet->m_retrieveRequest.setOwner(m_agentReference.getAgentAddress()); - privateRet->m_retrieveRequest.commit(); - // Remove the job from the archive queue - rq.removeJob(privateRet->m_retrieveRequest.getAddressIfSet()); - // We can commit and release the retrieve queue lock, we will only fill up - // memory structure from here on. + // We now have the queue.
+ { + log::ScopedParamContainer params(logContext); + params.add("vid", mountInfo.vid) + .add("queueObject", rq.getAddressIfSet()) + .add("queueSize", rq.dumpJobs().size()); + logContext.log(log::INFO, "In RetrieveMount::getNextJobBatch(): retrieve queue found."); + } + // We should build the list of jobs we intend to grab. We will attempt to + // dequeue them in one go, updating jobs in parallel. If some jobs turn out + // to not be there really, we will have to do several passes. + // We build directly the return value in the process. + auto candidateDumps=rq.dumpJobs(); + std::list<std::unique_ptr<OStoreDB::RetrieveJob>> candidateJobs; + // If we fail to find jobs in one round, we will exit. + while (candidateDumps.size() && currentBytes < bytesRequested && currentFiles < filesRequested) { + auto job=candidateDumps.front(); + candidateDumps.pop_front(); + // If we saw a retrieve request we could not pop nor cleanup (really bad case), we + // will disregard it for the rest of this getNextJobBatch call. We will re-consider + // in the next call. + if (!retrieveRequestsToSkip.count(job.address)) { + currentFiles++; + currentBytes+=job.size; + candidateJobs.emplace_back(new OStoreDB::RetrieveJob(job.address, m_objectStore, m_catalogue, m_logger, m_agentReference, *this)); + candidateJobs.back()->selectedCopyNb = job.copyNb; + } + } + { + log::ScopedParamContainer params(logContext); + params.add("vid", mountInfo.vid) + .add("queueObject", rq.getAddressIfSet()) + .add("candidatesCount", candidateJobs.size()) + .add("currentFiles", currentFiles) + .add("currentBytes", currentBytes) + .add("requestedFiles", filesRequested) + .add("requestedBytes", bytesRequested); + logContext.log(log::INFO, "In RetrieveMount::getNextJobBatch(): will process a set of candidate jobs."); + } + // We now have a batch of jobs to try and dequeue. Should not be empty. + // First add the jobs to the owned list of the agent.
+ std::list<std::string> addedJobs; + for (const auto &j: candidateJobs) addedJobs.emplace_back(j->m_retrieveRequest.getAddressIfSet()); + m_agentReference.addBatchToOwnership(addedJobs, m_objectStore); + // We can now attempt to switch the ownership of the jobs. Depending on the type of failure (if any) we + // will adapt the rest. + // First, start the parallel updates of jobs + std::list<std::unique_ptr<objectstore::RetrieveRequest::AsyncOwnerUpdater>> jobUpdates; + for (const auto &j: candidateJobs) jobUpdates.emplace_back( + j->m_retrieveRequest.asyncUpdateOwner(j->selectedCopyNb, m_agentReference.getAgentAddress(), rqAddress)); + // Now run through the results of the asynchronous updates. Non-sucess results come in the form of exceptions. + std::list<std::string> jobsToForget; // The jobs either absent or not owned, for which we should just remove references (agent). + std::list<std::string> jobsToDequeue; // The jobs that should not be queued anymore. All of them indeed (invalid or successfully poped). + std::list<std::unique_ptr<OStoreDB::RetrieveJob>> validatedJobs; // The jobs we successfully validated. + auto j=candidateJobs.begin(); // We will iterate on 2 lists... + auto ju=jobUpdates.begin(); + while (ju!=jobUpdates.end()) { + // Get the processing status of update + try { + (*ju)->wait(); + // Getting here means the update went through... We can proceed with removing the + // job from the queue, and populating the job to report in memory. 
+ jobsToDequeue.emplace_back((*j)->m_retrieveRequest.getAddressIfSet()); + (*j)->archiveFile = (*ju)->getArchiveFile(); + (*j)->retrieveRequest = (*ju)->getRetrieveRequest(); + (*j)->m_jobOwned = true; + (*j)->m_mountId = mountInfo.mountId; + log::ScopedParamContainer params(logContext); + params.add("vid", mountInfo.vid) + .add("queueObject", rq.getAddressIfSet()) + .add("requestObject", (*j)->m_retrieveRequest.getAddressIfSet()) + .add("fileId", (*j)->archiveFile.archiveFileID); + logContext.log(log::INFO, "In RetrieveMount::getNextJobBatch(): popped one job"); + validatedJobs.emplace_back(std::move(*j)); + } catch (cta::exception::Exception & e) { + std::string debugType=typeid(e).name(); + if (typeid(e) == typeid(Backend::NoSuchObject) || + typeid(e) == typeid(objectstore::RetrieveRequest::WrongPreviousOwner)) { + // The object was not present or not owned, so we skip it. It should be removed from + // the queue. + jobsToDequeue.emplace_back((*j)->m_retrieveRequest.getAddressIfSet()); + // Log the event. + log::ScopedParamContainer params(logContext); + params.add("vid", mountInfo.vid) + .add("queueObject", rq.getAddressIfSet()) + .add("requestObject", (*j)->m_retrieveRequest.getAddressIfSet()); + logContext.log(log::WARNING, "In RetrieveMount::getNextJobBatch(): skipped job not owned or not present."); + } else if (typeid(e) == typeid(Backend::CouldNotUnlock)) { + // We failed to unlock the object. The request was successfully updated, so we do own it. This is a non-fatal + // situation, so we just issue a warning. Removing the request from our agent ownership would + // orphan it.
+ log::ScopedParamContainer params(logContext); + int demangleStatus; + char * exceptionTypeStr = abi::__cxa_demangle(typeid(e).name(), nullptr, nullptr, &demangleStatus); + params.add("tapepool", mountInfo.tapePool) + .add("queueObject", rq.getAddressIfSet()) + .add("requestObject", (*j)->m_retrieveRequest.getAddressIfSet()); + if (!demangleStatus) { + params.add("exceptionType", exceptionTypeStr); + } else { + params.add("exceptionType", typeid(e).name()); + } + free(exceptionTypeStr); + exceptionTypeStr = nullptr; + params.add("message", e.getMessageValue()); + logContext.log(log::WARNING, "In ArchiveMount::getNextJobBatch(): Failed to unlock the request (lock expiration). Request remains selected."); + validatedJobs.emplace_back(std::move(*j)); + } else { + // This is not a success, yet we could not confirm the job status due to an unexpected error. + // We leave the queue as is. We forget about owning this job. This is an error. + log::ScopedParamContainer params(logContext); + int demangleStatus; + char * exceptionTypeStr = abi::__cxa_demangle(typeid(e).name(), nullptr, nullptr, &demangleStatus); + params.add("tapepool", mountInfo.tapePool) + .add("queueObject", rq.getAddressIfSet()) + .add("requestObject", (*j)->m_retrieveRequest.getAddressIfSet()); + if (!demangleStatus) { + params.add("exceptionType", exceptionTypeStr); + } else { + params.add("exceptionType", typeid(e).name()); + } + free(exceptionTypeStr); + exceptionTypeStr = nullptr; + params.add("message", e.getMessageValue()); + logContext.log(log::ERR, "In ArchiveMount::getNextJobBatch(): unexpected error. Leaving the job queued."); + jobsToForget.emplace_back((*j)->m_retrieveRequest.getAddressIfSet()); + retrieveRequestsToSkip.insert((*j)->m_retrieveRequest.getAddressIfSet()); + } + // This job is not for us. + jobsToForget.emplace_back((*j)->m_retrieveRequest.getAddressIfSet()); + // We also need to update the counts. 
+ currentFiles--; + currentBytes-=(*j)->archiveFile.fileSize; + } + // In all cases: move to the nexts. + ju=jobUpdates.erase(ju); + j=candidateJobs.erase(j); + } + // All (most) jobs are now officially owned by our agent. We can hence remove them from the queue. + for (const auto &j: jobsToDequeue) rq.removeJob(j); + if (jobsToForget.size()) m_agentReference.removeBatchFromOwnership(jobsToForget, m_objectStore); + // (Possibly intermediate) commit of the queue. We keep the lock for the moment. rq.commit(); - rqlock.release(); - privateRet->retrieveRequest = privateRet->m_retrieveRequest.getSchedulerRequest(); - privateRet->archiveFile = privateRet->m_retrieveRequest.getArchiveFile(); - privateRet->m_jobOwned = true; - privateRet->m_mountId = mountInfo.mountId; - return std::unique_ptr<SchedulerDatabase::RetrieveJob> (std::move(privateRet)); + // We can now add the validated jobs to the return value. + auto vj = validatedJobs.begin(); + while (vj != validatedJobs.end()) { + privateRet.emplace_back(std::move(*vj)); + vj=validatedJobs.erase(vj); + } + // Before going for another round, we can release the queue and delete it if we emptied it. + auto remainingJobs=rq.dumpJobs().size(); + rqLock.release(); + // If the queue is empty, we can get rid of it. + if (!remainingJobs) { + try { + // The queue should be removed as it is empty.
+ ScopedExclusiveLock rexl(re); + re.fetch(); + re.removeArchiveQueueAndCommit(mountInfo.tapePool); + log::ScopedParamContainer params(logContext); + params.add("tapepool", mountInfo.tapePool) + .add("queueObject", rq.getAddressIfSet()); + logContext.log(log::INFO, "In ArchiveMount::getNextJobBatch(): deleted empty queue"); + } catch (cta::exception::Exception &ex) { + log::ScopedParamContainer params(logContext); + params.add("tapepool", mountInfo.tapePool) + .add("queueObject", rq.getAddressIfSet()) + .add("Message", ex.getMessageValue()); + logContext.log(log::INFO, "In ArchiveMount::getNextJobBatch(): could not delete a presumably empty queue"); + } + } + // We can now summarize this round + { + log::ScopedParamContainer params(logContext); + params.add("tapepool", mountInfo.tapePool) + .add("queueObject", rq.getAddressIfSet()) + .add("filesAdded", currentFiles - beforeFiles) + .add("bytesAdded", currentBytes - beforeBytes) + .add("filesBefore", beforeFiles) + .add("bytesBefore", beforeBytes) + .add("filesAfter", currentFiles) + .add("bytesAfter", currentBytes) + .add("iterationCount", iterationCount); + logContext.log(log::INFO, "In ArchiveMount::getNextJobBatch(): did one round of jobs retrieval."); + } + // We could be done now. + if (currentBytes >= bytesRequested || currentFiles >= filesRequested) + break; + // If we had exhausted the queue while selecting the jobs, we stop here, else we can go for another + // round. + if (!candidateDumps.size()) + break; + } catch (cta::exception::Exception & ex) { + log::ScopedParamContainer params (logContext); + params.add("exceptionMessage", ex.getMessageValue()); + logContext.log(log::ERR, "In OStoreDB::ArchiveMount::getNextJobBatch(): error (CTA exception) getting more jobs. 
Backtrace follows."); + logContext.logBacktrace(log::ERR, ex.backtrace()); + break; + } catch (std::exception & e) { + log::ScopedParamContainer params (logContext); + params.add("exceptionWhat", e.what()); + logContext.log(log::ERR, "In OStoreDB::ArchiveMount::getNextJobBatch(): error (std exception) getting more jobs."); + break; + } catch (...) { + logContext.log(log::ERR, "In OStoreDB::ArchiveMount::getNextJobBatch(): error (unknown exception) getting more jobs."); + break; } - return std::unique_ptr<SchedulerDatabase::RetrieveJob>(); - } catch (cta::exception::Exception & ex) { - return nullptr; } -} - -//------------------------------------------------------------------------------ -// OStoreDB::RetrieveMount::getNextJobBatch() -//------------------------------------------------------------------------------ -std::list<std::unique_ptr<SchedulerDatabase::RetrieveJob> > OStoreDB::RetrieveMount::getNextJobBatch(uint64_t filesRequested, - uint64_t bytesRequested, log::LogContext& logContext) { - throw cta::exception::Exception("In OStoreDB::RetrieveMount::getNextJobBatch(): not implemented."); + // We either ran out of jobs or fulfilled the requirements. Time to build up the reply. + // Log the outcome. + uint64_t nFiles=privateRet.size(); + uint64_t nBytes=0; + for (auto & j: privateRet) { + nBytes+=j->archiveFile.fileSize; + } + { + log::ScopedParamContainer params(logContext); + params.add("tapepool", mountInfo.tapePool) + .add("files", nFiles) + .add("bytes", nBytes); + logContext.log(log::INFO, "In ArchiveMount::getNextJobBatch(): jobs retrieval complete."); + } + // We can construct the return value. 
+ std::list<std::unique_ptr<SchedulerDatabase::RetrieveJob> > ret; + for (auto & j: privateRet) ret.emplace_back(std::move(j)); + return ret; } @@ -2176,24 +2448,26 @@ void OStoreDB::ArchiveJob::bumpUpTapeFileCount(uint64_t newFileCount) { } //------------------------------------------------------------------------------ -// OStoreDB::ArchiveJob::succeed() +// OStoreDB::ArchiveJob::asyncSucceed() //------------------------------------------------------------------------------ -bool OStoreDB::ArchiveJob::succeed() { - // Lock the request and set the job as successful. - objectstore::ScopedExclusiveLock atfrl(m_archiveRequest); - m_archiveRequest.fetch(); - std::string atfrAddress = m_archiveRequest.getAddressIfSet(); - bool lastJob=m_archiveRequest.setJobSuccessful(tapeFile.copyNb); - if (lastJob) { - m_archiveRequest.remove(); - } else { - m_archiveRequest.commit(); +void OStoreDB::ArchiveJob::asyncSucceed() { + m_jobUpdate.reset(m_archiveRequest.asyncUpdateJobSuccessful(tapeFile.copyNb)); +} + +//------------------------------------------------------------------------------ +// OStoreDB::ArchiveJob::checkSucceed() +//------------------------------------------------------------------------------ +bool OStoreDB::ArchiveJob::checkSucceed() { + m_jobUpdate->wait(); + if (m_jobUpdate->m_isLastJob) { + m_archiveRequest.resetValues(); } // We no more own the job (which could be gone) m_jobOwned = false; // Remove ownership from agent - m_agentReference.removeFromOwnership(atfrAddress, m_objectStore); - return lastJob; + const std::string atfrAddress = m_archiveRequest.getAddressIfSet(); + m_agentReference.removeFromOwnership(atfrAddress, m_objectStore); + return m_jobUpdate->m_isLastJob; } //------------------------------------------------------------------------------ @@ -2201,6 +2475,7 @@ bool OStoreDB::ArchiveJob::succeed() { //------------------------------------------------------------------------------ OStoreDB::ArchiveJob::~ArchiveJob() { if (m_jobOwned) { + 
utils::Timer t; log::LogContext lc(m_logger); // Return the job to the pot if we failed to handle it. objectstore::ScopedExclusiveLock atfrl(m_archiveRequest); @@ -2212,6 +2487,7 @@ OStoreDB::ArchiveJob::~ArchiveJob() { params.add("agentObject", m_agentReference.getAgentAddress()) .add("jobObject", m_archiveRequest.getAddressIfSet()); m_agentReference.removeFromOwnership(m_archiveRequest.getAddressIfSet(), m_objectStore); + params.add("schedulerDbTime", t.secs()); lc.log(log::INFO, "In OStoreDB::ArchiveJob::~ArchiveJob(): removed job from ownership after garbage collection."); } } @@ -2220,9 +2496,9 @@ OStoreDB::ArchiveJob::~ArchiveJob() { // OStoreDB::RetrieveJob::RetrieveJob() //------------------------------------------------------------------------------ OStoreDB::RetrieveJob::RetrieveJob(const std::string& jobAddress, - objectstore::Backend& os, catalogue::Catalogue & c, objectstore::AgentReference& ar, + objectstore::Backend& os, catalogue::Catalogue & c, log::Logger &l, objectstore::AgentReference& ar, OStoreDB::RetrieveMount& rm): m_jobOwned(false), - m_objectStore(os), m_catalogue(c), m_agentReference(ar), m_retrieveRequest(jobAddress, os), + m_objectStore(os), m_catalogue(c), m_logger(l), m_agentReference(ar), m_retrieveRequest(jobAddress, os), m_retrieveMount(rm) { } //------------------------------------------------------------------------------ @@ -2299,74 +2575,22 @@ void OStoreDB::RetrieveJob::fail(log::LogContext &logContext) { // OStoreDB::RetrieveJob::~RetrieveJob() //------------------------------------------------------------------------------ OStoreDB::RetrieveJob::~RetrieveJob() { -// if (m_jobOwned) { -// // Re-queue the job entirely if we failed to handle it. -// try { -// // We now need to select the tape from which we will migrate next. This should -// // be the tape with the most jobs already queued. 
-// // TODO: this will have to look at tape statuses on the long run as well -// uint16_t selectedCopyNumber; -// uint64_t bestTapeQueuedBytes; -// std::string selectedVid; -// std::string selectedTapeAddress; -// objectstore::ScopedExclusiveLock rtfrl(m_rtfr); -// m_rtfr.fetch(); -// auto jl=m_rtfr.dumpJobs(); -// { -// // First tape copy is always better than nothing. -// auto tc=jl.begin(); -// selectedCopyNumber = tc->copyNb; -// selectedVid = tc->tape; -// selectedTapeAddress = tc->tapeAddress; -// // Get info for the tape. -// { -// objectstore::Tape t(tc->tapeAddress, m_objectStore); -// objectstore::ScopedSharedLock tl(t); -// t.fetch(); -// bestTapeQueuedBytes = t.getJobsSummary().bytes; -// } -// tc++; -// // Compare with the next ones -// for (;tc!=jl.end(); tc++) { -// objectstore::Tape t(tc->tapeAddress, m_objectStore); -// objectstore::ScopedSharedLock tl(t); -// t.fetch(); -// if (t.getJobsSummary().bytes > bestTapeQueuedBytes) { -// bestTapeQueuedBytes = t.getJobsSummary().bytes; -// selectedCopyNumber = tc->copyNb; -// selectedVid = tc->tape; -// selectedTapeAddress = tc->tapeAddress; -// } -// } -// } -// // We now can enqueue the request on this most promising tape. -// { -// objectstore::Tape tp(selectedTapeAddress, m_objectStore); -// ScopedExclusiveLock tpl(tp); -// tp.fetch(); -// objectstore::RetrieveToFileRequest::JobDump jd; -// jd.copyNb = selectedCopyNumber; -// jd.tape = selectedVid; -// jd.tapeAddress = selectedTapeAddress; -// tp.addJob(jd, m_rtfr.getAddressIfSet(), m_rtfr.getSize(), m_rtfr.getPriority(), m_rtfr.getEntryLog().time); -// tp.commit(); -// } -// // The request is now fully set. It belongs to the tape. 
-// std::string previousOwner = m_rtfr.getOwner(); -// m_rtfr.setOwner(selectedTapeAddress); -// m_rtfr.commit(); -// // And remove reference from the agent (if it was owned by an agent) -// try { -// if (!previousOwner.size()) -// return; -// objectstore::Agent agent(previousOwner, m_objectStore); -// objectstore::ScopedExclusiveLock al(agent); -// agent.fetch(); -// agent.removeFromOwnership(m_rtfr.getAddressIfSet()); -// agent.commit(); -// } catch (...) {} -// } catch (...) {} -// } + if (m_jobOwned) { + utils::Timer t; + log::LogContext lc(m_logger); + // Return the job to the pot if we failed to handle it. + objectstore::ScopedExclusiveLock rr(m_retrieveRequest); + m_retrieveRequest.fetch(); + m_retrieveRequest.garbageCollect(m_agentReference.getAgentAddress(), m_agentReference, lc, m_catalogue); + rr.release(); + // Remove ownership from agent + log::ScopedParamContainer params(lc); + params.add("agentObject", m_agentReference.getAgentAddress()) + .add("jobObject", m_retrieveRequest.getAddressIfSet()); + m_agentReference.removeFromOwnership(m_retrieveRequest.getAddressIfSet(), m_objectStore); + params.add("schdulerDbTime", t.secs()); + lc.log(log::INFO, "In OStoreDB::RetrieveJob::~RetrieveJob(): removed job from ownership after garbage collection."); + } } //------------------------------------------------------------------------------ @@ -2374,6 +2598,7 @@ OStoreDB::RetrieveJob::~RetrieveJob() { //------------------------------------------------------------------------------ void OStoreDB::RetrieveJob::succeed() { // Lock the request and set the request as successful (delete it). 
+ utils::Timer t; objectstore::ScopedExclusiveLock rtfrl(m_retrieveRequest); m_retrieveRequest.fetch(); std::string rtfrAddress = m_retrieveRequest.getAddressIfSet(); @@ -2381,6 +2606,11 @@ void OStoreDB::RetrieveJob::succeed() { m_jobOwned = false; // Remove ownership form the agent m_agentReference.removeFromOwnership(rtfrAddress, m_objectStore); + log::LogContext lc(m_logger); + log::ScopedParamContainer params(lc); + params.add("requestObject", rtfrAddress) + .add("schedulerDbTime", t.secs()); + lc.log(log::INFO, "In RetrieveJob::succeed(): deleted completed retrieve request."); } diff --git a/scheduler/OStoreDB/OStoreDB.hpp b/scheduler/OStoreDB/OStoreDB.hpp index ae73b7a016b686dca7f81c3c083ac0060996e8a0..df6da7fc8b94ad7ec7182bbf2f68bd6dde19b6e6 100644 --- a/scheduler/OStoreDB/OStoreDB.hpp +++ b/scheduler/OStoreDB/OStoreDB.hpp @@ -109,7 +109,12 @@ private: * @param tmdi The TapeMountDecisionInfo where to store the data. * @param re A RootEntry object that should be locked and fetched. 
*/ - void fetchMountInfo(SchedulerDatabase::TapeMountDecisionInfo &tmdi, objectstore::RootEntry &re); + enum class FetchFlavour: bool { + lock, + noLock + }; + void fetchMountInfo(SchedulerDatabase::TapeMountDecisionInfo &tmdi, objectstore::RootEntry &re, + FetchFlavour fetchFlavour=FetchFlavour::lock); public: std::unique_ptr<SchedulerDatabase::TapeMountDecisionInfo> getMountInfo() override; std::unique_ptr<SchedulerDatabase::TapeMountDecisionInfo> getMountInfoNoLock() override; @@ -140,8 +145,9 @@ public: public: CTA_GENERATE_EXCEPTION_CLASS(JobNowOwned); CTA_GENERATE_EXCEPTION_CLASS(NoSuchJob); - bool succeed() override; void fail(log::LogContext & lc) override; + void asyncSucceed() override; + bool checkSucceed() override; void bumpUpTapeFileCount(uint64_t newFileCount) override; ~ArchiveJob() override; private: @@ -156,20 +162,21 @@ public: objectstore::AgentReference & m_agentReference; objectstore::ArchiveRequest m_archiveRequest; ArchiveMount & m_archiveMount; + std::unique_ptr<objectstore::ArchiveRequest::AsyncJobSuccessfulUpdater> m_jobUpdate; }; /* === Retrieve Mount handling ============================================ */ class RetrieveMount: public SchedulerDatabase::RetrieveMount { friend class TapeMountDecisionInfo; private: - RetrieveMount(objectstore::Backend &, objectstore::AgentReference &, catalogue::Catalogue &); + RetrieveMount(objectstore::Backend &, objectstore::AgentReference &, catalogue::Catalogue &, log::Logger &); objectstore::Backend & m_objectStore; - objectstore::AgentReference & m_agentReference; catalogue::Catalogue & m_catalogue; + log::Logger & m_logger; + objectstore::AgentReference & m_agentReference; public: const MountInfo & getMountInfo() override; std::list<std::unique_ptr<RetrieveJob> > getNextJobBatch(uint64_t filesRequested, uint64_t bytesRequested, log::LogContext& logContext) override; - std::unique_ptr<RetrieveJob> getNextJob(log::LogContext & logContext) override; void complete(time_t completionTime) override; void 
setDriveStatus(cta::common::dataStructures::DriveStatus status, time_t completionTime) override; void setTapeSessionStats(const castor::tape::tapeserver::daemon::TapeSessionStats &stats) override; @@ -185,12 +192,13 @@ public: virtual void fail(log::LogContext &) override; virtual ~RetrieveJob() override; private: - RetrieveJob(const std::string &, objectstore::Backend &, catalogue::Catalogue &, - objectstore::AgentReference &, RetrieveMount &); + RetrieveJob(const std::string &, objectstore::Backend &, catalogue::Catalogue &, + log::Logger &, objectstore::AgentReference &, RetrieveMount &); bool m_jobOwned; uint64_t m_mountId; objectstore::Backend & m_objectStore; catalogue::Catalogue & m_catalogue; + log::Logger & m_logger; objectstore::AgentReference & m_agentReference; objectstore::RetrieveRequest m_retrieveRequest; OStoreDB::RetrieveMount & m_retrieveMount; diff --git a/scheduler/RetrieveMount.cpp b/scheduler/RetrieveMount.cpp index bfd2740a1fe94e890ac2254c0fc198364a38a8ac..141877d9f49be7b6240ed83fb4cccb69a1b9d9df 100644 --- a/scheduler/RetrieveMount.cpp +++ b/scheduler/RetrieveMount.cpp @@ -34,28 +34,28 @@ cta::RetrieveMount::RetrieveMount( } //------------------------------------------------------------------------------ -// getMountType +// getMountType() //------------------------------------------------------------------------------ cta::common::dataStructures::MountType cta::RetrieveMount::getMountType() const{ return cta::common::dataStructures::MountType::Retrieve; } //------------------------------------------------------------------------------ -// getNbFiles +// getNbFiles() //------------------------------------------------------------------------------ uint32_t cta::RetrieveMount::getNbFiles() const { return m_dbMount->nbFilesCurrentlyOnTape; } //------------------------------------------------------------------------------ -// getVid +// getVid() //------------------------------------------------------------------------------ std::string 
cta::RetrieveMount::getVid() const{ return m_dbMount->mountInfo.vid; } //------------------------------------------------------------------------------ -// getMountTransactionId +// getMountTransactionId() //------------------------------------------------------------------------------ std::string cta::RetrieveMount::getMountTransactionId() const{ std::stringstream id; @@ -66,25 +66,28 @@ std::string cta::RetrieveMount::getMountTransactionId() const{ } //------------------------------------------------------------------------------ -// getNextJob +// getNextJobBatch() //------------------------------------------------------------------------------ -std::unique_ptr<cta::RetrieveJob> cta::RetrieveMount::getNextJob(log::LogContext & logContext) { +std::list<std::unique_ptr<cta::RetrieveJob> > cta::RetrieveMount::getNextJobBatch(uint64_t filesRequested, uint64_t bytesRequested, + log::LogContext& logContext) { if (!m_sessionRunning) - throw SessionNotRunning("In RetrieveMount::getNextJob(): trying to get job from complete/not started session"); + throw SessionNotRunning("In RetrieveMount::getNextJobBatch(): trying to get job from complete/not started session"); // Try and get a new job from the DB - std::unique_ptr<cta::SchedulerDatabase::RetrieveJob> dbJob(m_dbMount->getNextJob(logContext).release()); - if (!dbJob.get()) - return std::unique_ptr<cta::RetrieveJob>(); - // We have something to retrieve: prepare the response - std::unique_ptr<cta::RetrieveJob> ret (new RetrieveJob(*this, - dbJob->retrieveRequest, dbJob->archiveFile, dbJob->selectedCopyNb, - PositioningMethod::ByBlock)); - ret->m_dbJob.reset(dbJob.release()); + std::list<std::unique_ptr<cta::SchedulerDatabase::RetrieveJob>> dbJobBatch(m_dbMount->getNextJobBatch(filesRequested, + bytesRequested, logContext)); + std::list<std::unique_ptr<RetrieveJob>> ret; + // We prepare the response + for (auto & sdrj: dbJobBatch) { + ret.emplace_back(new RetrieveJob(*this, + sdrj->retrieveRequest, sdrj->archiveFile, 
sdrj->selectedCopyNb, + PositioningMethod::ByBlock)); + ret.back()->m_dbJob.reset(sdrj.release()); + } return ret; } //------------------------------------------------------------------------------ -// tapeComplete()) +// tapeComplete() //------------------------------------------------------------------------------ void cta::RetrieveMount::tapeComplete() { m_tapeRunning = false; @@ -101,7 +104,7 @@ void cta::RetrieveMount::tapeComplete() { } //------------------------------------------------------------------------------ -// diskComplete()) +// diskComplete() //------------------------------------------------------------------------------ void cta::RetrieveMount::diskComplete() { m_diskRunning = false; @@ -116,7 +119,7 @@ void cta::RetrieveMount::diskComplete() { } //------------------------------------------------------------------------------ -// abort()) +// abort() //------------------------------------------------------------------------------ void cta::RetrieveMount::abort() { diskComplete(); @@ -138,7 +141,7 @@ void cta::RetrieveMount::setTapeSessionStats(const castor::tape::tapeserver::dae } //------------------------------------------------------------------------------ -// bothSidesComplete()) +// bothSidesComplete() //------------------------------------------------------------------------------ bool cta::RetrieveMount::bothSidesComplete() { return !(m_diskRunning || m_tapeRunning); diff --git a/scheduler/RetrieveMount.hpp b/scheduler/RetrieveMount.hpp index 6a35c057e41ea5c6614b4c8b9a5a840ac5b2ebf3..1b6c577ac33570361e7659de0d5badb368054883 100644 --- a/scheduler/RetrieveMount.hpp +++ b/scheduler/RetrieveMount.hpp @@ -117,13 +117,19 @@ namespace cta { virtual bool bothSidesComplete(); CTA_GENERATE_EXCEPTION_CLASS(SessionNotRunning); + /** - * Job factory - * - * @return A unique_ptr to the next archive job or NULL if there are no more - * archive jobs left for this tape mount. 
- */ - virtual std::unique_ptr<RetrieveJob> getNextJob(log::LogContext & logContext); + * Batch job factory + * + * @param filesRequested the number of files requested + * @param bytesRequested the number of bytes requested + * @param logContext + * @return a list of unique_ptr to the next retrieve jobs. The list is empty + * when no more jobs can be found. Will return jobs (if available) until one + * of the 2 criteria is fulfilled. + */ + virtual std::list<std::unique_ptr<RetrieveJob>> getNextJobBatch(uint64_t filesRequested, + uint64_t bytesRequested, log::LogContext &logContext); /** * Destructor. diff --git a/scheduler/Scheduler.cpp b/scheduler/Scheduler.cpp index d4418220e5a903f15a670769aff8d673c7f1e37d..07807abe0e899ff13b8939286bf1d248ea6abfd5 100644 --- a/scheduler/Scheduler.cpp +++ b/scheduler/Scheduler.cpp @@ -55,20 +55,32 @@ Scheduler::~Scheduler() throw() { } //------------------------------------------------------------------------------ // ping //------------------------------------------------------------------------------ -void Scheduler::ping() { - m_db.ping(); +void Scheduler::ping(log::LogContext & lc) { + cta::utils::Timer t; m_catalogue.ping(); + auto catalogueTime = t.secs(cta::utils::Timer::resetCounter); + m_db.ping(); + auto schedulerDbTime = t.secs(); + log::ScopedParamContainer spc(lc); + spc.add("catalogueTime", catalogueTime) + .add("schedulerDbTime", schedulerDbTime); + lc.log(log::INFO, "In Scheduler::ping(): success."); } //------------------------------------------------------------------------------ // authorizeAdmin //------------------------------------------------------------------------------ -void Scheduler::authorizeAdmin(const common::dataStructures::SecurityIdentity &cliIdentity){ +void Scheduler::authorizeAdmin(const common::dataStructures::SecurityIdentity &cliIdentity, log::LogContext & lc){ + cta::utils::Timer t; if(!(m_catalogue.isAdmin(cliIdentity))) { std::stringstream msg; msg << "User: " << cliIdentity.username << 
" on host: " << cliIdentity.host << " is not authorized to execute CTA admin commands"; throw exception::UserError(msg.str()); } + auto catalogueTime = t.secs(); + log::ScopedParamContainer spc(lc); + spc.add("catalogueTime", catalogueTime); + lc.log(log::INFO, "In Scheduler::authorizeAdmin(): success."); } //------------------------------------------------------------------------------ @@ -129,7 +141,8 @@ void Scheduler::queueRetrieve( utils::Timer t; // Get the queue criteria const common::dataStructures::RetrieveFileQueueCriteria queueCriteria = - m_catalogue.prepareToRetrieveFile(instanceName, request.archiveFileID, request.requester); + m_catalogue.prepareToRetrieveFile(instanceName, request.archiveFileID, request.requester, lc); + auto catalogueTime = t.secs(cta::utils::Timer::resetCounter); std::string selectedVid = m_db.queueRetrieve(request, queueCriteria, lc); auto schedulerDbTime = t.secs(); log::ScopedParamContainer spc(lc); @@ -167,6 +180,7 @@ void Scheduler::queueRetrieve( .add("policyMaxDrives", queueCriteria.mountPolicy.maxDrivesAllowed) .add("policyMinAge", queueCriteria.mountPolicy.retrieveMinRequestAge) .add("policyPriority", queueCriteria.mountPolicy.retrievePriority) + .add("catalogueTime", catalogueTime) .add("schedulerDbTime", schedulerDbTime); lc.log(log::INFO, "Queued retrieve request"); } @@ -174,7 +188,7 @@ void Scheduler::queueRetrieve( //------------------------------------------------------------------------------ // deleteArchive //------------------------------------------------------------------------------ -void Scheduler::deleteArchive(const std::string &instanceName, const common::dataStructures::DeleteArchiveRequest &request) { +void Scheduler::deleteArchive(const std::string &instanceName, const common::dataStructures::DeleteArchiveRequest &request, log::LogContext & lc) { // We have different possible scenarios here. The file can be safe in the catalogue, // fully queued, or partially queued. 
// First, make sure the file is not queued anymore. @@ -192,8 +206,14 @@ void Scheduler::deleteArchive(const std::string &instanceName, const common::dat //} // We did delete the file from the queue. It hence might be absent from the catalogue. // Errors are not fatal here (so we filter them out). + try { - m_catalogue.deleteArchiveFile(instanceName, request.archiveFileID); + utils::Timer t; + m_catalogue.deleteArchiveFile(instanceName, request.archiveFileID, lc); + auto catalogueTime = t.secs(cta::utils::Timer::resetCounter); + log::ScopedParamContainer spc(lc); + spc.add("catalogueTime", catalogueTime); + lc.log(log::INFO, "In Scheduler::deleteArchive(): success."); } catch (exception::UserError &) {} } @@ -322,10 +342,16 @@ common::dataStructures::WriteTestResult Scheduler::write_autoTest(const common:: //------------------------------------------------------------------------------ // getDesiredDriveState //------------------------------------------------------------------------------ -common::dataStructures::DesiredDriveState Scheduler::getDesiredDriveState(const std::string& driveName) { +common::dataStructures::DesiredDriveState Scheduler::getDesiredDriveState(const std::string& driveName, log::LogContext & lc) { + utils::Timer t; auto driveStates = m_db.getDriveStates(); for (auto & d: driveStates) { if (d.driveName == driveName) { + auto schedulerDbTime = t.secs(); + log::ScopedParamContainer spc(lc); + spc.add("drive", driveName) + .add("schedulerDbTime", schedulerDbTime); + lc.log(log::INFO, "In Scheduler::getDesiredDriveState(): success."); return d.desiredDriveState; } } @@ -335,11 +361,19 @@ common::dataStructures::DesiredDriveState Scheduler::getDesiredDriveState(const //------------------------------------------------------------------------------ // setDesiredDriveState //------------------------------------------------------------------------------ -void Scheduler::setDesiredDriveState(const common::dataStructures::SecurityIdentity &cliIdentity, 
const std::string &driveName, const bool up, const bool force) { +void Scheduler::setDesiredDriveState(const common::dataStructures::SecurityIdentity &cliIdentity, const std::string &driveName, const bool up, const bool force, log::LogContext & lc) { + utils::Timer t; common::dataStructures::DesiredDriveState desiredDriveState; desiredDriveState.up = up; desiredDriveState.forceDown = force; m_db.setDesiredDriveState(driveName, desiredDriveState); + auto schedulerDbTime = t.secs(); + log::ScopedParamContainer spc(lc); + spc.add("drive", driveName) + .add("up", up?"up":"down") + .add("force", force?"yes":"no") + .add("schedulerDbTime", schedulerDbTime); + lc.log(log::INFO, "In Scheduler::setDesiredDriveState(): success."); } //------------------------------------------------------------------------------ @@ -384,8 +418,14 @@ std::list<common::dataStructures::RetrieveJob> Scheduler::getPendingRetrieveJobs //------------------------------------------------------------------------------ // getDriveStates //------------------------------------------------------------------------------ -std::list<common::dataStructures::DriveState> Scheduler::getDriveStates(const common::dataStructures::SecurityIdentity &cliIdentity) const { - return m_db.getDriveStates(); +std::list<common::dataStructures::DriveState> Scheduler::getDriveStates(const common::dataStructures::SecurityIdentity &cliIdentity, log::LogContext & lc) const { + utils::Timer t; + auto ret = m_db.getDriveStates(); + auto schedulerDbTime = t.secs(); + log::ScopedParamContainer spc(lc); + spc.add("schedulerDbTime", schedulerDbTime); + lc.log(log::INFO, "In Scheduler::getDriveStates(): success."); + return ret; } //------------------------------------------------------------------------------ @@ -473,12 +513,15 @@ std::unique_ptr<TapeMount> Scheduler::getNextMount(const std::string &logicalLib } catch (std::out_of_range &) { existingMounts = 0; } + uint32_t effectiveExistingMounts = 0; + if (m->type == 
common::dataStructures::MountType::Archive) effectiveExistingMounts = existingMounts; bool mountPassesACriteria = false; - if (m->bytesQueued / (1 + existingMounts) >= m_minBytesToWarrantAMount) + + if (m->bytesQueued / (1 + effectiveExistingMounts) >= m_minBytesToWarrantAMount) mountPassesACriteria = true; - if (m->filesQueued / (1 + existingMounts) >= m_minFilesToWarrantAMount) + if (m->filesQueued / (1 + effectiveExistingMounts) >= m_minFilesToWarrantAMount) mountPassesACriteria = true; - if (!existingMounts && ((time(NULL) - m->oldestJobStartTime) > m->minArchiveRequestAge)) + if (!effectiveExistingMounts && ((time(NULL) - m->oldestJobStartTime) > m->minArchiveRequestAge)) mountPassesACriteria = true; if (!mountPassesACriteria || existingMounts >= m->maxDrivesAllowed) { log::ScopedParamContainer params(lc); @@ -638,10 +681,15 @@ std::unique_ptr<TapeMount> Scheduler::getNextMount(const std::string &logicalLib return std::unique_ptr<TapeMount>(); } +//------------------------------------------------------------------------------ +// getQueuesAndMountSummaries +//------------------------------------------------------------------------------ std::list<common::dataStructures::QueueAndMountSummary> Scheduler::getQueuesAndMountSummaries(log::LogContext& lc) { std::list<common::dataStructures::QueueAndMountSummary> ret; // Extract relevant information from the object store. + utils::Timer t; auto mountDecisionInfo=m_db.getMountInfoNoLock(); + auto schedulerDbTime = t.secs(utils::Timer::resetCounter); auto & mdi __attribute__((unused)) = *mountDecisionInfo; for (auto & pm: mountDecisionInfo->potentialMounts) { // Find or create the relevant entry. 
@@ -723,6 +771,11 @@ std::list<common::dataStructures::QueueAndMountSummary> Scheduler::getQueuesAndM mountOrQueue.tapePool = t.tapePoolName; } } + auto respondPreparationTime = t.secs(); + log::ScopedParamContainer spc(lc); + spc.add("schedulerDbTime", schedulerDbTime) + .add("respondPreparationTime", respondPreparationTime); + lc.log(log::INFO, "In Scheduler::getQueuesAndMountSummaries(): success."); return ret; } diff --git a/scheduler/Scheduler.hpp b/scheduler/Scheduler.hpp index cb17951313d3d2d24e0bc3335e96d9c494a2f6b9..5d633f7c7d5c5fdcdee96868be30dbb709e82cfd 100644 --- a/scheduler/Scheduler.hpp +++ b/scheduler/Scheduler.hpp @@ -84,7 +84,7 @@ public: * Validates that the underlying storages are accessible * Lets the exception through in case of failure. */ - void ping(); + void ping(log::LogContext & lc); /** * Queue an archive request and return the CTA file ID. @@ -112,7 +112,8 @@ public: * Throws a (Non)RetryableError exception in case something else goes wrong with the request */ void deleteArchive(const std::string &instanceName, - const cta::common::dataStructures::DeleteArchiveRequest &request); + const cta::common::dataStructures::DeleteArchiveRequest &request, + log::LogContext & lc); /** * Cancel an ongoing retrieval. @@ -201,7 +202,7 @@ public: * @param driveName * @return The structure representing the desired states */ - common::dataStructures::DesiredDriveState getDesiredDriveState(const std::string &driveName); + common::dataStructures::DesiredDriveState getDesiredDriveState(const std::string &driveName, log::LogContext & lc); /** * Sets the desired drive state. This function is used by the front end to pass instructions to the @@ -213,8 +214,7 @@ public: * @param force indicates whether we want to force the drive to be up. */ //TODO: replace the 2 bools with a structure. 
void setDesiredDriveState(const cta::common::dataStructures::SecurityIdentity &cliIdentity, - const std::string &driveName, - const bool up, const bool force); + const std::string &driveName, const bool up, const bool force, log::LogContext & lc); /** * Reports the state of the drive to the object store. This information is then reported @@ -235,7 +235,7 @@ public: * @return A list of drive state structures. */ std::list<cta::common::dataStructures::DriveState> getDriveStates( - const cta::common::dataStructures::SecurityIdentity &cliIdentity) const; + const cta::common::dataStructures::SecurityIdentity &cliIdentity, log::LogContext & lc) const; /*============== Actual mount scheduling and queue status reporting ========*/ /** @@ -254,7 +254,7 @@ public: std::list<common::dataStructures::QueueAndMountSummary> getQueuesAndMountSummaries(log::LogContext & lc); /*============== Administrator management ==================================*/ - void authorizeAdmin(const cta::common::dataStructures::SecurityIdentity &cliIdentity); + void authorizeAdmin(const cta::common::dataStructures::SecurityIdentity &cliIdentity, log::LogContext & lc); private: diff --git a/scheduler/SchedulerDatabase.hpp b/scheduler/SchedulerDatabase.hpp index 217c0f251d93aaf80df0d752fae8027cd0898f18..89426ea1ed58d13c70b96af988323dee05bea536 100644 --- a/scheduler/SchedulerDatabase.hpp +++ b/scheduler/SchedulerDatabase.hpp @@ -185,9 +185,11 @@ public: std::string archiveReportURL; cta::common::dataStructures::ArchiveFile archiveFile; cta::common::dataStructures::TapeFile tapeFile; - /// Indicates a success to the DB. If this is the last job, return true. - virtual bool succeed() = 0; virtual void fail(log::LogContext & lc) = 0; + /// Indicates a success to the DB. + virtual void asyncSucceed() = 0; + /// Check a succeed job status. If this is the last job, return true. 
+ virtual bool checkSucceed() = 0; virtual void bumpUpTapeFileCount(uint64_t newFileCount) = 0; virtual ~ArchiveJob() {} }; @@ -321,7 +323,6 @@ public: virtual const MountInfo & getMountInfo() = 0; virtual std::list<std::unique_ptr<RetrieveJob>> getNextJobBatch(uint64_t filesRequested, uint64_t bytesRequested, log::LogContext& logContext) = 0; - virtual std::unique_ptr<RetrieveJob> getNextJob(log::LogContext & logContext) = 0; virtual void complete(time_t completionTime) = 0; virtual void setDriveStatus(common::dataStructures::DriveStatus status, time_t completionTime) = 0; virtual void setTapeSessionStats(const castor::tape::tapeserver::daemon::TapeSessionStats &stats) = 0; diff --git a/scheduler/SchedulerDatabaseTest.cpp b/scheduler/SchedulerDatabaseTest.cpp index e923a331a262589e5e3e0c223b225bdf7805c2e7..04bed21469f34c5d89d4efdd6429227f10f6b6b9 100644 --- a/scheduler/SchedulerDatabaseTest.cpp +++ b/scheduler/SchedulerDatabaseTest.cpp @@ -205,7 +205,8 @@ TEST_P(SchedulerDatabaseTest, createManyArchiveJobs) { if (aj.size()) { count++; //std::cout << aj->archiveFile.diskFileInfo.path << std::endl; - aj.front()->succeed(); + aj.front()->asyncSucceed(); + aj.front()->checkSucceed(); } else done = true; @@ -291,7 +292,8 @@ TEST_P(SchedulerDatabaseTest, createManyArchiveJobs) { if (aj.size()) { count++; //std::cout << aj->archiveFile.diskFileInfo.path << std::endl; - aj.front()->succeed(); + aj.front()->asyncSucceed(); + aj.front()->checkSucceed(); } else done = true; diff --git a/scheduler/SchedulerTest.cpp b/scheduler/SchedulerTest.cpp index eef9864aaac663ce5abb49e5d62e4abe2e0a7896..35bcc403a48334a5bf0d5147d2b955c1f8499306 100644 --- a/scheduler/SchedulerTest.cpp +++ b/scheduler/SchedulerTest.cpp @@ -459,7 +459,10 @@ TEST_P(SchedulerTest, archive_and_retrieve_new_file) { archiveJob->tapeFile.checksumValue = "1234abcd"; archiveJob->tapeFile.compressedSize = archiveJob->archiveFile.fileSize; archiveJob->tapeFile.copyNb = 1; - archiveJob->complete(); + 
archiveJob->validate(); + archiveJob->writeToCatalogue(); + archiveJob->asyncSetJobSucceed(); + archiveJob->checkAndAsyncReportComplete(); archiveJobBatch = archiveMount->getNextJobBatch(1,1,lc); ASSERT_EQ(0, archiveJobBatch.size()); archiveMount->complete(); @@ -514,11 +517,13 @@ TEST_P(SchedulerTest, archive_and_retrieve_new_file) { retrieveMount.reset(dynamic_cast<cta::RetrieveMount*>(mount.release())); ASSERT_NE((cta::RetrieveMount*)NULL, retrieveMount.get()); std::unique_ptr<cta::RetrieveJob> retrieveJob; - retrieveJob.reset(retrieveMount->getNextJob(lc).release()); + auto jobBatch = retrieveMount->getNextJobBatch(1,1,lc); + ASSERT_EQ(1, jobBatch.size()); + retrieveJob.reset(jobBatch.front().release()); ASSERT_NE((cta::RetrieveJob*)NULL, retrieveJob.get()); retrieveJob->complete(); - retrieveJob.reset(retrieveMount->getNextJob(lc).release()); - ASSERT_EQ((cta::RetrieveJob*)NULL, retrieveJob.get()); + jobBatch = retrieveMount->getNextJobBatch(1,1,lc); + ASSERT_EQ(0, jobBatch.size()); } } diff --git a/scheduler/testingMocks/MockArchiveJob.hpp b/scheduler/testingMocks/MockArchiveJob.hpp index 96375a20f783eb64d7011643fc2f2334cfadcc2f..53df1ac01bf2e807945c6fb93ced91301aee72d9 100644 --- a/scheduler/testingMocks/MockArchiveJob.hpp +++ b/scheduler/testingMocks/MockArchiveJob.hpp @@ -33,14 +33,43 @@ namespace cta { completes(0), failures(0) {} ~MockArchiveJob() throw() {} - - bool complete() override { + + void failed(const cta::exception::Exception& ex, log::LogContext & lc) override { + failures++; + } + + virtual void asyncSetJobSucceed() override { completes++; + } + virtual bool checkAndAsyncReportComplete() override { return false; + } + virtual void validate() override {} + virtual void writeToCatalogue() override {} + virtual catalogue::TapeFileWritten validateAndGetTapeFileWritten() override { + catalogue::TapeFileWritten fileReport; + fileReport.archiveFileId = archiveFile.archiveFileID; + fileReport.blockId = tapeFile.blockId; + fileReport.checksumType = 
tapeFile.checksumType; + fileReport.checksumValue = tapeFile.checksumValue; + fileReport.compressedSize = tapeFile.compressedSize; + fileReport.copyNb = tapeFile.copyNb; + fileReport.diskFileId = archiveFile.diskFileId; + fileReport.diskFileUser = archiveFile.diskFileInfo.owner; + fileReport.diskFileGroup = archiveFile.diskFileInfo.group; + fileReport.diskFilePath = archiveFile.diskFileInfo.path; + fileReport.diskFileRecoveryBlob = archiveFile.diskFileInfo.recoveryBlob; + fileReport.diskInstance = archiveFile.diskInstance; + fileReport.fSeq = tapeFile.fSeq; + fileReport.size = archiveFile.fileSize; + fileReport.storageClassName = archiveFile.storageClass; + fileReport.tapeDrive = "dummy"; + fileReport.vid = tapeFile.vid; + return fileReport; } - - void failed(const cta::exception::Exception& ex, log::LogContext & lc) override { + virtual void failed(const cta::exception::Exception& ex) { failures++; } + virtual void retry() {} }; } diff --git a/scheduler/testingMocks/MockRetrieveMount.hpp b/scheduler/testingMocks/MockRetrieveMount.hpp index e0af365e77d343d2e073ad74ff04ff4da60c14c6..62f77972b8503a845f9e4c409868bd8807091811 100644 --- a/scheduler/testingMocks/MockRetrieveMount.hpp +++ b/scheduler/testingMocks/MockRetrieveMount.hpp @@ -33,15 +33,23 @@ namespace cta { ~MockRetrieveMount() throw() { } - std::unique_ptr<cta::RetrieveJob> getNextJob(log::LogContext & logContext) override { + std::list<std::unique_ptr<cta::RetrieveJob> > getNextJobBatch(uint64_t filesRequested, + uint64_t bytesRequested, log::LogContext & logContext) override { + std::list<std::unique_ptr<cta::RetrieveJob> > ret; + // Count the attempt to get a file (even if not successful). 
getJobs++; - if(m_jobs.empty()) { - return std::unique_ptr<cta::RetrieveJob>(); - } else { - std::unique_ptr<cta::RetrieveJob> job = std::move(m_jobs.front()); + while (m_jobs.size()) { + ret.emplace_back(m_jobs.front().release()); m_jobs.pop_front(); - return job; + // Count the next attempt to get the file" + if (filesRequested <= 1 || bytesRequested <= ret.back()->archiveFile.fileSize) + break; + else + getJobs++; + bytesRequested -= ret.back()->archiveFile.fileSize; + filesRequested--; } + return ret; } virtual std::string getMountTransactionId() const override { diff --git a/tapeserver/castor/tape/tapeserver/daemon/DataTransferSession.cpp b/tapeserver/castor/tape/tapeserver/daemon/DataTransferSession.cpp index 8d6ccbdd249f6a82ea4c8aa7155c2996a20e922f..b97d8e5ec4c168cd5ebed4022c9639f404f15e21 100644 --- a/tapeserver/castor/tape/tapeserver/daemon/DataTransferSession.cpp +++ b/tapeserver/castor/tape/tapeserver/daemon/DataTransferSession.cpp @@ -94,7 +94,7 @@ castor::tape::tapeserver::daemon::Session::EndOfSessionAction schedule: while (true) { try { - auto desiredState = m_scheduler.getDesiredDriveState(m_driveConfig.unitName); + auto desiredState = m_scheduler.getDesiredDriveState(m_driveConfig.unitName, lc); if (!desiredState.up) { downUpTransition = true; // We wait a bit before polling the scheduler again. @@ -119,7 +119,7 @@ schedule: if (!emptyDriveProbe.driveIsEmpty()) { m_scheduler.reportDriveStatus(m_driveInfo, cta::common::dataStructures::MountType::NoMount, cta::common::dataStructures::DriveStatus::Down); cta::common::dataStructures::SecurityIdentity securityIdentity; - m_scheduler.setDesiredDriveState(securityIdentity, m_driveConfig.unitName, false, false); + m_scheduler.setDesiredDriveState(securityIdentity, m_driveConfig.unitName, false, false, lc); lc.log(cta::log::ERR, "A tape was detected in the drive. 
Putting the drive back down."); goto schedule; } else { @@ -137,7 +137,7 @@ schedule: lc.logBacktrace(cta::log::ERR, e.backtrace()); m_scheduler.reportDriveStatus(m_driveInfo, cta::common::dataStructures::MountType::NoMount, cta::common::dataStructures::DriveStatus::Down); cta::common::dataStructures::SecurityIdentity cliId; - m_scheduler.setDesiredDriveState(cliId, m_driveConfig.unitName, false, false); + m_scheduler.setDesiredDriveState(cliId, m_driveConfig.unitName, false, false, lc); return MARK_DRIVE_AS_DOWN; } // No mount to be done found, that was fast... diff --git a/tapeserver/castor/tape/tapeserver/daemon/DataTransferSessionTest.cpp b/tapeserver/castor/tape/tapeserver/daemon/DataTransferSessionTest.cpp index 2b345975d8b76c17ac642d37f18f8466ac15783a..f934c54acfdcb25758a5ea6a18c997bae5c36a21 100644 --- a/tapeserver/castor/tape/tapeserver/daemon/DataTransferSessionTest.cpp +++ b/tapeserver/castor/tape/tapeserver/daemon/DataTransferSessionTest.cpp @@ -434,7 +434,7 @@ TEST_P(DataTransferSessionTest, DataTransferSessionGooddayRecall) { driveInfo.host=="host"; // We need to create the drive in the registry before being able to put it up. scheduler.reportDriveStatus(driveInfo, cta::common::dataStructures::MountType::NoMount, cta::common::dataStructures::DriveStatus::Down); - scheduler.setDesiredDriveState(s_adminOnAdminHost, driveConfig.unitName, true, false); + scheduler.setDesiredDriveState(s_adminOnAdminHost, driveConfig.unitName, true, false, logContext); // 7) Create the data transfer session DataTransferConfig castorConf; @@ -627,7 +627,7 @@ TEST_P(DataTransferSessionTest, DataTransferSessionWrongRecall) { driveInfo.host=="host"; // We need to create the drive in the registry before being able to put it up. 
scheduler.reportDriveStatus(driveInfo, cta::common::dataStructures::MountType::NoMount, cta::common::dataStructures::DriveStatus::Down); - scheduler.setDesiredDriveState(s_adminOnAdminHost, driveConfig.unitName, true, false); + scheduler.setDesiredDriveState(s_adminOnAdminHost, driveConfig.unitName, true, false, logContext); // 7) Create the data transfer session DataTransferConfig castorConf; @@ -830,7 +830,7 @@ TEST_P(DataTransferSessionTest, DataTransferSessionRAORecall) { driveInfo.logicalLibrary=driveConfig.rawLibrarySlot; // We need to create the drive in the registry before being able to put it up. scheduler.reportDriveStatus(driveInfo, cta::common::dataStructures::MountType::NoMount, cta::common::dataStructures::DriveStatus::Down); - scheduler.setDesiredDriveState(s_adminOnAdminHost, driveConfig.unitName, true, false); + scheduler.setDesiredDriveState(s_adminOnAdminHost, driveConfig.unitName, true, false, logContext); // 7) Create the data transfer session DataTransferConfig castorConf; @@ -1004,7 +1004,7 @@ TEST_P(DataTransferSessionTest, DataTransferSessionNoSuchDrive) { driveInfo.host=="host"; // We need to create the drive in the registry before being able to put it up. scheduler.reportDriveStatus(driveInfo, cta::common::dataStructures::MountType::NoMount, cta::common::dataStructures::DriveStatus::Down); - scheduler.setDesiredDriveState(s_adminOnAdminHost, driveConfig.unitName, true, false); + scheduler.setDesiredDriveState(s_adminOnAdminHost, driveConfig.unitName, true, false, logContext); // 8) Create the data transfer session DataTransferConfig castorConf; @@ -1148,7 +1148,7 @@ TEST_P(DataTransferSessionTest, DataTransferSessionFailtoMount) { driveInfo.host=="host"; // We need to create the drive in the registry before being able to put it up. 
scheduler.reportDriveStatus(driveInfo, cta::common::dataStructures::MountType::NoMount, cta::common::dataStructures::DriveStatus::Down); - scheduler.setDesiredDriveState(s_adminOnAdminHost, driveConfig.unitName, true, false); + scheduler.setDesiredDriveState(s_adminOnAdminHost, driveConfig.unitName, true, false, logContext); // 8) Create the data transfer session DataTransferConfig castorConf; @@ -1275,7 +1275,7 @@ TEST_P(DataTransferSessionTest, DataTransferSessionGooddayMigration) { driveInfo.host=="host"; // We need to create the drive in the registry before being able to put it up. scheduler.reportDriveStatus(driveInfo, cta::common::dataStructures::MountType::NoMount, cta::common::dataStructures::DriveStatus::Down); - scheduler.setDesiredDriveState(s_adminOnAdminHost, driveConfig.unitName, true, false); + scheduler.setDesiredDriveState(s_adminOnAdminHost, driveConfig.unitName, true, false, logContext); // Create the data transfer session DataTransferConfig castorConf; @@ -1417,7 +1417,7 @@ TEST_P(DataTransferSessionTest, DataTransferSessionMissingFilesMigration) { driveInfo.host=="host"; // We need to create the drive in the registry before being able to put it up. scheduler.reportDriveStatus(driveInfo, cta::common::dataStructures::MountType::NoMount, cta::common::dataStructures::DriveStatus::Down); - scheduler.setDesiredDriveState(s_adminOnAdminHost, driveConfig.unitName, true, false); + scheduler.setDesiredDriveState(s_adminOnAdminHost, driveConfig.unitName, true, false, logContext); // Create the data transfer session DataTransferConfig castorConf; @@ -1553,7 +1553,7 @@ TEST_P(DataTransferSessionTest, DataTransferSessionTapeFullMigration) { driveInfo.host=="host"; // We need to create the drive in the registry before being able to put it up. 
scheduler.reportDriveStatus(driveInfo, cta::common::dataStructures::MountType::NoMount, cta::common::dataStructures::DriveStatus::Down); - scheduler.setDesiredDriveState(s_adminOnAdminHost, driveConfig.unitName, true, false); + scheduler.setDesiredDriveState(s_adminOnAdminHost, driveConfig.unitName, true, false, logContext); // Create the data transfer session DataTransferConfig castorConf; @@ -1703,7 +1703,7 @@ TEST_P(DataTransferSessionTest, DataTransferSessionTapeFullOnFlushMigration) { driveInfo.host=="host"; // We need to create the drive in the registry before being able to put it up. scheduler.reportDriveStatus(driveInfo, cta::common::dataStructures::MountType::NoMount, cta::common::dataStructures::DriveStatus::Down); - scheduler.setDesiredDriveState(s_adminOnAdminHost, driveConfig.unitName, true, false); + scheduler.setDesiredDriveState(s_adminOnAdminHost, driveConfig.unitName, true, false, logContext); // Create the data transfer session DataTransferConfig castorConf; diff --git a/tapeserver/castor/tape/tapeserver/daemon/DiskWriteTaskTest.cpp b/tapeserver/castor/tape/tapeserver/daemon/DiskWriteTaskTest.cpp index f999dd375e87ccf8100ad9bd9618462ec33ea8cd..d1498733bfdd8177bdd5281bb663ce8dfdf47274 100644 --- a/tapeserver/castor/tape/tapeserver/daemon/DiskWriteTaskTest.cpp +++ b/tapeserver/castor/tape/tapeserver/daemon/DiskWriteTaskTest.cpp @@ -42,7 +42,6 @@ namespace unitTests{ class TestingDatabaseRetrieveMount: public cta::SchedulerDatabase::RetrieveMount { const MountInfo & getMountInfo() override { throw std::runtime_error("Not implemented"); } std::list<std::unique_ptr<cta::SchedulerDatabase::RetrieveJob> > getNextJobBatch(uint64_t filesRequested, uint64_t bytesRequested, cta::log::LogContext& logContext) override { throw std::runtime_error("Not implemented");} - std::unique_ptr<cta::SchedulerDatabase::RetrieveJob> getNextJob(cta::log::LogContext &) override { throw std::runtime_error("Not implemented");} void complete(time_t completionTime) override { 
throw std::runtime_error("Not implemented"); } void setDriveStatus(cta::common::dataStructures::DriveStatus status, time_t completionTime) override { throw std::runtime_error("Not implemented"); } void setTapeSessionStats(const castor::tape::tapeserver::daemon::TapeSessionStats &stats) override { throw std::runtime_error("Not implemented"); } diff --git a/tapeserver/castor/tape/tapeserver/daemon/DiskWriteThreadPoolTest.cpp b/tapeserver/castor/tape/tapeserver/daemon/DiskWriteThreadPoolTest.cpp index f9b7d49ccdedd66bebdae25fe2a170025de34646..d9bb8cfe65841de99aa760d307447337ebe5b5e3 100644 --- a/tapeserver/castor/tape/tapeserver/daemon/DiskWriteThreadPoolTest.cpp +++ b/tapeserver/castor/tape/tapeserver/daemon/DiskWriteThreadPoolTest.cpp @@ -37,7 +37,6 @@ namespace unitTests{ class TestingDatabaseRetrieveMount: public cta::SchedulerDatabase::RetrieveMount { const MountInfo & getMountInfo() override { throw std::runtime_error("Not implemented"); } std::list<std::unique_ptr<cta::SchedulerDatabase::RetrieveJob> > getNextJobBatch(uint64_t filesRequested, uint64_t bytesRequested, cta::log::LogContext& logContext) override { throw std::runtime_error("Not implemented");} - std::unique_ptr<cta::SchedulerDatabase::RetrieveJob> getNextJob(cta::log::LogContext & logContext) override { throw std::runtime_error("Not implemented");} void complete(time_t completionTime) override { throw std::runtime_error("Not implemented"); } void setDriveStatus(cta::common::dataStructures::DriveStatus status, time_t completionTime) override { throw std::runtime_error("Not implemented"); } void setTapeSessionStats(const castor::tape::tapeserver::daemon::TapeSessionStats &stats) override { throw std::runtime_error("Not implemented"); } diff --git a/tapeserver/castor/tape/tapeserver/daemon/MigrationReportPacker.cpp b/tapeserver/castor/tape/tapeserver/daemon/MigrationReportPacker.cpp index c5ad2e4609977047f1b73d3889c3f431b02b6064..0e004b22eeafe02fe01561efe3d5811b770b2cbe 100644 --- 
a/tapeserver/castor/tape/tapeserver/daemon/MigrationReportPacker.cpp +++ b/tapeserver/castor/tape/tapeserver/daemon/MigrationReportPacker.cpp @@ -24,6 +24,7 @@ #include "castor/tape/tapeserver/daemon/MigrationReportPacker.hpp" #include "castor/tape/tapeserver/daemon/TaskWatchDog.hpp" #include "castor/tape/tapeserver/drive/DriveInterface.hpp" +#include "catalogue/TapeFileWritten.hpp" #include <memory> #include <numeric> @@ -205,59 +206,130 @@ void MigrationReportPacker::ReportFlush::execute(MigrationReportPacker& reportPa reportPacker.m_lc.log(cta::log::INFO,"Received a flush report from tape, but had no file to report to client. Doing nothing."); return; } - std::unique_ptr<cta::ArchiveJob> job; - try{ - while(!reportPacker.m_successfulArchiveJobs.empty()) { - // Get the next job to report and make sure we will not attempt to process it twice. - job = std::move(reportPacker.m_successfulArchiveJobs.front()); - reportPacker.m_successfulArchiveJobs.pop(); - if (!job.get()) continue; - cta::log::ScopedParamContainer params(reportPacker.m_lc); - params.add("fileId", job->archiveFile.archiveFileID) - .add("diskInstance", job->archiveFile.diskInstance) - .add("diskFileId", job->archiveFile.diskFileId) - .add("lastKnownDiskPath", job->archiveFile.diskFileInfo.path); - if (job->complete()) { - params.add("reportURL", job->reportURL()); - reportPacker.m_lc.log(cta::log::INFO,"Reported to the client a full file archival"); - } else { - reportPacker.m_lc.log(cta::log::INFO, "Recorded the partial migration of a file"); - } - job.reset(nullptr); - } - reportPacker.m_lc.log(cta::log::INFO,"Reported to the client that a batch of files was written on tape"); - } catch(const cta::exception::Exception& e){ - cta::log::ScopedParamContainer params(reportPacker.m_lc); - params.add("exceptionMessageValue", e.getMessageValue()); - if (job.get()) { - params.add("fileId", job->archiveFile.archiveFileID) - .add("diskInstance", job->archiveFile.diskInstance) - .add("diskFileId", 
job->archiveFile.diskFileId) - .add("lastKnownDiskPath", job->archiveFile.diskFileInfo.path) - .add("reportURL", job->reportURL()); - } - const std::string msg_error="An exception was caught trying to call reportMigrationResults"; - reportPacker.m_lc.log(cta::log::ERR, msg_error); - throw failedMigrationRecallResult(msg_error); - } catch(const std::exception& e){ - cta::log::ScopedParamContainer params(reportPacker.m_lc); - params.add("exceptionWhat", e.what()); - if (job.get()) { - params.add("fileId", job->archiveFile.archiveFileID) - .add("diskInstance", job->archiveFile.diskInstance) - .add("diskFileId", job->archiveFile.diskFileId) - .add("lastKnownDiskPath", job->archiveFile.diskFileInfo.path); - } - const std::string msg_error="An std::exception was caught trying to call reportMigrationResults"; - reportPacker.m_lc.log(cta::log::ERR, msg_error); - throw failedMigrationRecallResult(msg_error); - } + proceedJobsBatch(reportPacker,std::move(reportPacker.m_successfulArchiveJobs), reportPacker.m_lc); } else { // This is an abnormal situation: we should never flush after an error! reportPacker.m_lc.log(cta::log::ALERT,"Received a flush after an error: sending file errors to client"); } } +//------------------------------------------------------------------------------ +//ReportFlush::proceedJobsBatch +//------------------------------------------------------------------------------ +void MigrationReportPacker::ReportFlush::proceedJobsBatch(const MigrationReportPacker& reportPacker, std::queue<std::unique_ptr<cta::ArchiveJob> > successfulArchiveJobs, cta::log::LogContext &logContext){ + std::set<cta::catalogue::TapeFileWritten> tapeFilesWritten; + std::list<std::unique_ptr<cta::ArchiveJob> > validatedSuccessfulArchiveJobs; + std::unique_ptr<cta::ArchiveJob> job; + try{ + while(!successfulArchiveJobs.empty()) { + // Get the next job to report and make sure we will not attempt to process it twice. 
+ job = std::move(successfulArchiveJobs.front()); + successfulArchiveJobs.pop(); + if (!job.get()) continue; + tapeFilesWritten.insert(job->validateAndGetTapeFileWritten()); + validatedSuccessfulArchiveJobs.emplace_back(std::move(job)); + job.reset(nullptr); + } + + updateCatalogueWithTapeFilesWritten(reportPacker, tapeFilesWritten, logContext); + asyncUpdateBackendWithJobsSucceeded(validatedSuccessfulArchiveJobs); + checkAndAsyncReportCompletedJobs(validatedSuccessfulArchiveJobs, logContext); + + logContext.log(cta::log::INFO,"Reported to the client that a batch of files was written on tape"); + } catch(const cta::exception::Exception& e){ + cta::log::ScopedParamContainer params(logContext); + params.add("exceptionMessageValue", e.getMessageValue()); + if (job.get()) { + params.add("fileId", job->archiveFile.archiveFileID) + .add("diskInstance", job->archiveFile.diskInstance) + .add("diskFileId", job->archiveFile.diskFileId) + .add("lastKnownDiskPath", job->archiveFile.diskFileInfo.path) + .add("reportURL", job->reportURL()); + } + const std::string msg_error="An exception was caught trying to call reportMigrationResults"; + logContext.log(cta::log::ERR, msg_error); + throw failedMigrationRecallResult(msg_error); + } catch(const std::exception& e){ + cta::log::ScopedParamContainer params(logContext); + params.add("exceptionWhat", e.what()); + if (job.get()) { + params.add("fileId", job->archiveFile.archiveFileID) + .add("diskInstance", job->archiveFile.diskInstance) + .add("diskFileId", job->archiveFile.diskFileId) + .add("lastKnownDiskPath", job->archiveFile.diskFileInfo.path); + } + const std::string msg_error="An std::exception was caught trying to call reportMigrationResults"; + logContext.log(cta::log::ERR, msg_error); + throw failedMigrationRecallResult(msg_error); + } +} + +//------------------------------------------------------------------------------ +//ReportFlush::asyncUpdateBackendWithJobsSucceeded 
+//------------------------------------------------------------------------------ +void MigrationReportPacker::ReportFlush::asyncUpdateBackendWithJobsSucceeded( + const std::list<std::unique_ptr<cta::ArchiveJob> > &validatedSuccessfulArchiveJobs) { + for (const auto &job: validatedSuccessfulArchiveJobs){ + job->asyncSetJobSucceed(); + } +} + +//------------------------------------------------------------------------------ +//ReportFlush::checkAndAsyncReportCompletedJobs +//------------------------------------------------------------------------------ +void MigrationReportPacker::ReportFlush::checkAndAsyncReportCompletedJobs( + std::list<std::unique_ptr<cta::ArchiveJob> > &validatedSuccessfulArchiveJobs, + cta::log::LogContext &logContext) { + std::list<std::unique_ptr <cta::ArchiveJob> > reportedArchiveJobs; + + for (auto &job: validatedSuccessfulArchiveJobs){ + cta::log::ScopedParamContainer params(logContext); + params.add("fileId", job->archiveFile.archiveFileID) + .add("diskInstance", job->archiveFile.diskInstance) + .add("diskFileId", job->archiveFile.diskFileId) + .add("lastKnownDiskPath", job->archiveFile.diskFileInfo.path); + logContext.log(cta::log::DEBUG, + "In MigrationReportPacker::ReportFlush::checkAndAsyncReportCompletedJobs()" + " check for async backend update finished"); + if(job->checkAndAsyncReportComplete()) { + params.add("reportURL", job->reportURL()); + reportedArchiveJobs.emplace_back(std::move(job)); + logContext.log(cta::log::INFO,"Sent to the client a full file archival"); + } else { + logContext.log(cta::log::INFO, "Recorded the partial migration of a file"); + } + } + + for (const auto &job: reportedArchiveJobs){ + try { + job->waitForReporting(); // should not be a deadWait as soon as we have a timeout on the xroot query + cta::log::ScopedParamContainer params(logContext); + params.add("reportURL", job->reportURL()); + logContext.log(cta::log::INFO,"Reported to the client a full file archival"); + } catch(cta::exception::Exception &ex) 
{ + cta::log::ScopedParamContainer params(logContext); + params.add("reportURL", job->reportURL()); + params.add("errorMessage", ex.getMessage().str()); + logContext.log(cta::log::ERR,"Unsuccessful report to the client a full file archival:"); + } catch(...) { + throw; + } + } +} + +//------------------------------------------------------------------------------ +//ReportFlush::updateCatalogueWithTapeFilesWritten +//------------------------------------------------------------------------------ +void MigrationReportPacker::ReportFlush::updateCatalogueWithTapeFilesWritten( + const MigrationReportPacker &reportPacker, + const std::set<cta::catalogue::TapeFileWritten> &tapeFilesWritten, + cta::log::LogContext &logContext) { + reportPacker.m_archiveMount->updateCatalogueWithTapeFilesWritten(tapeFilesWritten); + cta::log::ScopedParamContainer params(logContext); + params.add("tapeFilesWritten", tapeFilesWritten.size()); + logContext.log(cta::log::INFO,"Catalog updated for batch of jobs"); +} + //------------------------------------------------------------------------------ //reportTapeFull()::execute //------------------------------------------------------------------------------ diff --git a/tapeserver/castor/tape/tapeserver/daemon/MigrationReportPacker.hpp b/tapeserver/castor/tape/tapeserver/daemon/MigrationReportPacker.hpp index dd014536c7a2e4ff5e068f3294ef558c2311164c..1bd7a6f2e051a7591bb4b1c5fca61a4b5457f381 100644 --- a/tapeserver/castor/tape/tapeserver/daemon/MigrationReportPacker.hpp +++ b/tapeserver/castor/tape/tapeserver/daemon/MigrationReportPacker.hpp @@ -165,6 +165,18 @@ private: ReportFlush(drive::compressionStats compressStats):m_compressStats(compressStats){} void execute(MigrationReportPacker& reportPacker) override; + void proceedJobsBatch(const MigrationReportPacker& reportPacker, + std::queue<std::unique_ptr<cta::ArchiveJob> > successfulArchiveJobs, + cta::log::LogContext &log); + void asyncUpdateBackendWithJobsSucceeded( + const 
std::list<std::unique_ptr<cta::ArchiveJob> > &validatedSuccessfulArchiveJobs); + void checkAndAsyncReportCompletedJobs( + std::list<std::unique_ptr<cta::ArchiveJob> > &validatedSuccessfulArchiveJobs, + cta::log::LogContext &logContext); + void updateCatalogueWithTapeFilesWritten( + const MigrationReportPacker &reportPacker, + const std::set<cta::catalogue::TapeFileWritten> &tapeFilesWritten, + cta::log::LogContext &logContext); }; class ReportTapeFull: public Report { public: diff --git a/tapeserver/castor/tape/tapeserver/daemon/MigrationReportPackerTest.cpp b/tapeserver/castor/tape/tapeserver/daemon/MigrationReportPackerTest.cpp index 687e28a600292ed152befe6cd5e77479ffcd3d4a..908f5e8c7f5578a55bc23da328a63d95b0e1f05a 100644 --- a/tapeserver/castor/tape/tapeserver/daemon/MigrationReportPackerTest.cpp +++ b/tapeserver/castor/tape/tapeserver/daemon/MigrationReportPackerTest.cpp @@ -69,16 +69,42 @@ namespace unitTests { int & completes, int &failures): MockArchiveJob(am, catalogue), completesRef(completes), failuresRef(failures) {} - bool complete() override { + virtual void asyncSetJobSucceed() override { completesRef++; + } + + virtual bool checkAndAsyncReportComplete() override { return false; } + virtual void validate() override {} + virtual void writeToCatalogue() override {} + virtual cta::catalogue::TapeFileWritten validateAndGetTapeFileWritten() override { + cta::catalogue::TapeFileWritten fileReport; + fileReport.archiveFileId = archiveFile.archiveFileID; + fileReport.blockId = tapeFile.blockId; + fileReport.checksumType = tapeFile.checksumType; + fileReport.checksumValue = tapeFile.checksumValue; + fileReport.compressedSize = tapeFile.compressedSize; + fileReport.copyNb = tapeFile.copyNb; + fileReport.diskFileId = archiveFile.diskFileId; + fileReport.diskFileUser = archiveFile.diskFileInfo.owner; + fileReport.diskFileGroup = archiveFile.diskFileInfo.group; + fileReport.diskFilePath = archiveFile.diskFileInfo.path; + fileReport.diskFileRecoveryBlob = 
archiveFile.diskFileInfo.recoveryBlob; + fileReport.diskInstance = archiveFile.diskInstance; + fileReport.fSeq = tapeFile.fSeq; + fileReport.size = archiveFile.fileSize; + fileReport.storageClassName = archiveFile.storageClass; + fileReport.tapeDrive = std::string("testDrive"); + fileReport.vid = tapeFile.vid; + return fileReport; + } + void failed(const cta::exception::Exception& ex, cta::log::LogContext & lc) override { failuresRef++; } - private: int & completesRef; int & failuresRef; @@ -86,7 +112,29 @@ namespace unitTests { TEST_F(castor_tape_tapeserver_daemon_MigrationReportPackerTest, MigrationReportPackerNominal) { cta::MockArchiveMount tam(*m_catalogue); + + const std::string vid1 = "VTEST001"; + const std::string vid2 = "VTEST002"; + const std::string logicalLibraryName = "logical_library_name"; + const std::string tapePoolName = "tape_pool_name"; + const uint64_t capacityInBytes = (uint64_t)10 * 1000 * 1000 * 1000 * 1000; + const bool disabledValue = true; + const bool fullValue = false; + const std::string createTapeComment = "Create tape"; + cta::common::dataStructures::SecurityIdentity admin = cta::common::dataStructures::SecurityIdentity("admin","localhost"); + m_catalogue->createLogicalLibrary(admin, logicalLibraryName, "Create logical library"); + m_catalogue->createTapePool(admin, tapePoolName, 2, true, "Create tape pool"); + m_catalogue->createTape(admin, vid1, logicalLibraryName, tapePoolName, capacityInBytes, + disabledValue, fullValue, createTapeComment); + + cta::common::dataStructures::StorageClass storageClass; + storageClass.diskInstance = "disk_instance"; + storageClass.name = "storage_class"; + storageClass.nbCopies = 1; + storageClass.comment = "Create storage class"; + m_catalogue->createStorageClass(admin, storageClass); + ::testing::InSequence dummy; std::unique_ptr<cta::ArchiveJob> job1; int job1completes(0), job1failures(0); @@ -95,6 +143,25 @@ namespace unitTests { new MockArchiveJobExternalStats(tam, *m_catalogue, job1completes, 
job1failures)); job1.reset(mockJob.release()); } + job1->archiveFile.archiveFileID=1; + job1->archiveFile.diskInstance="disk_instance"; + job1->archiveFile.diskFileId="diskFileId1"; + job1->archiveFile.diskFileInfo.path="filePath1"; + job1->archiveFile.diskFileInfo.owner="testUser1"; + job1->archiveFile.diskFileInfo.group="testGroup1"; + job1->archiveFile.diskFileInfo.recoveryBlob="recoveryBlob1"; + job1->archiveFile.fileSize=1024; + job1->archiveFile.checksumType="md5"; + job1->archiveFile.checksumValue="b170288bf1f61b26a648358866f4d6c6"; + job1->archiveFile.storageClass="storage_class"; + job1->tapeFile.vid="VTEST001"; + job1->tapeFile.fSeq=1; + job1->tapeFile.blockId=256; + job1->tapeFile.compressedSize=768; + job1->tapeFile.copyNb=1; + job1->tapeFile.checksumType="md5"; + job1->tapeFile.checksumValue="b170288bf1f61b26a648358866f4d6c6"; + std::unique_ptr<cta::ArchiveJob> job2; int job2completes(0), job2failures(0); { @@ -102,7 +169,25 @@ namespace unitTests { new MockArchiveJobExternalStats(tam, *m_catalogue, job2completes, job2failures)); job2.reset(mockJob.release()); } - + job2->archiveFile.archiveFileID=2; + job2->archiveFile.diskInstance="disk_instance"; + job2->archiveFile.diskFileId="diskFileId2"; + job2->archiveFile.diskFileInfo.path="filePath2"; + job2->archiveFile.diskFileInfo.owner="testUser2"; + job2->archiveFile.diskFileInfo.group="testGroup2"; + job2->archiveFile.diskFileInfo.recoveryBlob="recoveryBlob2"; + job2->archiveFile.fileSize=1024; + job2->archiveFile.checksumType="md5"; + job2->archiveFile.checksumValue="b170288bf1f61b26a648358866f4d6c6"; + job2->archiveFile.storageClass="storage_class"; + job2->tapeFile.vid="VTEST001"; + job2->tapeFile.fSeq=2; + job2->tapeFile.blockId=512; + job2->tapeFile.compressedSize=768; + job2->tapeFile.copyNb=1; + job2->tapeFile.checksumType="md5"; + job2->tapeFile.checksumValue="b170288bf1f61b26a648358866f4d6c6"; + cta::log::StringLogger 
log("castor_tape_tapeserver_daemon_MigrationReportPackerNominal",cta::log::DEBUG); cta::log::LogContext lc(log); tapeserver::daemon::MigrationReportPacker mrp(&tam,lc); @@ -170,9 +255,31 @@ namespace unitTests { ASSERT_EQ(1, job3failures); } - TEST_F(castor_tape_tapeserver_daemon_MigrationReportPackerTest, MigrationReportPackerOneByteFile) { + TEST_F(castor_tape_tapeserver_daemon_MigrationReportPackerTest, MigrationReportPackerBadFile) { cta::MockArchiveMount tam(*m_catalogue); + + const std::string vid1 = "VTEST001"; + const std::string vid2 = "VTEST002"; + const std::string logicalLibraryName = "logical_library_name"; + const std::string tapePoolName = "tape_pool_name"; + const uint64_t capacityInBytes = (uint64_t)10 * 1000 * 1000 * 1000 * 1000; + const bool disabledValue = true; + const bool fullValue = false; + const std::string createTapeComment = "Create tape"; + cta::common::dataStructures::SecurityIdentity admin = cta::common::dataStructures::SecurityIdentity("admin","localhost"); + m_catalogue->createLogicalLibrary(admin, logicalLibraryName, "Create logical library"); + m_catalogue->createTapePool(admin, tapePoolName, 2, true, "Create tape pool"); + m_catalogue->createTape(admin, vid1, logicalLibraryName, tapePoolName, capacityInBytes, + disabledValue, fullValue, createTapeComment); + + cta::common::dataStructures::StorageClass storageClass; + storageClass.diskInstance = "disk_instance"; + storageClass.name = "storage_class"; + storageClass.nbCopies = 1; + storageClass.comment = "Create storage class"; + m_catalogue->createStorageClass(admin, storageClass); + ::testing::InSequence dummy; std::unique_ptr<cta::ArchiveJob> migratedBigFile; int migratedBigFileCompletes(0), migratedBigFileFailures(0); @@ -196,9 +303,62 @@ namespace unitTests { migratedNullFile.reset(mockJob.release()); } - migratedBigFile->archiveFile.fileSize=100000; - migratedFileSmall->archiveFile.fileSize=1; - migratedNullFile->archiveFile.fileSize=0; + 
migratedBigFile->archiveFile.archiveFileID=4; + migratedBigFile->archiveFile.diskInstance="disk_instance"; + migratedBigFile->archiveFile.diskFileId="diskFileId2"; + migratedBigFile->archiveFile.diskFileInfo.path="filePath2"; + migratedBigFile->archiveFile.diskFileInfo.owner="testUser2"; + migratedBigFile->archiveFile.diskFileInfo.group="testGroup2"; + migratedBigFile->archiveFile.diskFileInfo.recoveryBlob="recoveryBlob2"; + migratedBigFile->archiveFile.fileSize=100000; + migratedBigFile->archiveFile.checksumType="md5"; + migratedBigFile->archiveFile.checksumValue="b170288bf1f61b26a648358866f4d6c6"; + migratedBigFile->archiveFile.storageClass="storage_class"; + migratedBigFile->tapeFile.vid="VTEST001"; + migratedBigFile->tapeFile.fSeq=1; + migratedBigFile->tapeFile.blockId=256; + migratedBigFile->tapeFile.compressedSize=768; + migratedBigFile->tapeFile.copyNb=1; + migratedBigFile->tapeFile.checksumType="md5"; + migratedBigFile->tapeFile.checksumValue="b170288bf1f61b26a648358866f4d6c6"; + + migratedFileSmall->archiveFile.archiveFileID=5; + migratedFileSmall->archiveFile.diskInstance="disk_instance"; + migratedFileSmall->archiveFile.diskFileId="diskFileId3"; + migratedFileSmall->archiveFile.diskFileInfo.path="filePath3"; + migratedFileSmall->archiveFile.diskFileInfo.owner="testUser2"; + migratedFileSmall->archiveFile.diskFileInfo.group="testGroup2"; + migratedFileSmall->archiveFile.diskFileInfo.recoveryBlob="recoveryBlob2"; + migratedFileSmall->archiveFile.fileSize=1; + migratedFileSmall->archiveFile.checksumType="md5"; + migratedFileSmall->archiveFile.checksumValue="b170288bf1f61b26a648358866f4d6c6"; + migratedFileSmall->archiveFile.storageClass="storage_class"; + migratedFileSmall->tapeFile.vid="VTEST001"; + migratedFileSmall->tapeFile.fSeq=2; + migratedFileSmall->tapeFile.blockId=512; + migratedFileSmall->tapeFile.compressedSize=1; + migratedFileSmall->tapeFile.copyNb=1; + migratedFileSmall->tapeFile.checksumType="md5"; + 
migratedFileSmall->tapeFile.checksumValue="b170288bf1f61b26a648358866f4d6c6"; + + migratedNullFile->archiveFile.archiveFileID=6; + migratedNullFile->archiveFile.diskInstance="disk_instance"; + migratedNullFile->archiveFile.diskFileId="diskFileId4"; + migratedNullFile->archiveFile.diskFileInfo.path="filePath4"; + migratedNullFile->archiveFile.diskFileInfo.owner="testUser2"; + migratedNullFile->archiveFile.diskFileInfo.group="testGroup2"; + migratedNullFile->archiveFile.diskFileInfo.recoveryBlob="recoveryBlob2"; + migratedNullFile->archiveFile.fileSize=0; + migratedNullFile->archiveFile.checksumType="md5"; + migratedNullFile->archiveFile.checksumValue="b170288bf1f61b26a648358866f4d6c6"; + migratedNullFile->archiveFile.storageClass="storage_class"; + migratedNullFile->tapeFile.vid="VTEST001"; + migratedNullFile->tapeFile.fSeq=3; + migratedNullFile->tapeFile.blockId=768; + migratedNullFile->tapeFile.compressedSize=0; + migratedNullFile->tapeFile.copyNb=1; + migratedNullFile->tapeFile.checksumType="md5"; + migratedNullFile->tapeFile.checksumValue="b170288bf1f61b26a648358866f4d6c6"; cta::log::StringLogger log("castor_tape_tapeserver_daemon_MigrationReportPackerOneByteFile",cta::log::DEBUG); cta::log::LogContext lc(log); @@ -216,10 +376,12 @@ namespace unitTests { mrp.waitThread(); std::string temp = log.getLog(); - ASSERT_NE(std::string::npos, temp.find("Reported to the client that a batch of files was written on tape")); - ASSERT_EQ(1, tam.completes); - ASSERT_EQ(1, migratedBigFileCompletes); - ASSERT_EQ(1, migratedFileSmallCompletes); - ASSERT_EQ(1, migratedNullFileCompletes); + ASSERT_NE(std::string::npos, temp.find("TapeFileWrittenEvent is invalid")); + ASSERT_NE(std::string::npos, temp.find("Successfully closed client's session " + "after the failed report MigrationResult")); + ASSERT_EQ(0, tam.completes); + ASSERT_EQ(0, migratedBigFileCompletes); + ASSERT_EQ(0, migratedFileSmallCompletes); + ASSERT_EQ(0, migratedNullFileCompletes); } } diff --git 
a/tapeserver/castor/tape/tapeserver/daemon/RecallTaskInjector.cpp b/tapeserver/castor/tape/tapeserver/daemon/RecallTaskInjector.cpp index 820153025128a368d4d5fe0b413ade16f9aa5ca4..bc07ee3f9045fe5d57aa127a584a192ea3917f05 100644 --- a/tapeserver/castor/tape/tapeserver/daemon/RecallTaskInjector.cpp +++ b/tapeserver/castor/tape/tapeserver/daemon/RecallTaskInjector.cpp @@ -205,15 +205,9 @@ void RecallTaskInjector::injectBulkRecalls() { bool RecallTaskInjector::synchronousFetch() { try { - uint64_t files=0; - uint64_t bytes=0; - while(files<=m_maxFiles && bytes<=m_maxBytes) { - std::unique_ptr<cta::RetrieveJob> job=m_retrieveMount.getNextJob(m_lc); - if(!job.get()) break; - files++; - bytes+=job->archiveFile.fileSize; - m_jobs.emplace_back(job.release()); - } + auto jobsList = m_retrieveMount.getNextJobBatch(m_maxFiles, m_maxBytes, m_lc); + for (auto & j: jobsList) + m_jobs.emplace_back(j.release()); } catch (cta::exception::Exception & ex) { cta::log::ScopedParamContainer scoped(m_lc); scoped.add("transactionId", m_retrieveMount.getMountTransactionId()) @@ -289,16 +283,9 @@ void RecallTaskInjector::WorkerThread::run() break; } m_parent.m_lc.log(cta::log::DEBUG,"RecallJobInjector:run: about to call client interface"); - uint64_t files=0; - uint64_t bytes=0; - while(files<=req.filesRequested && bytes<=req.bytesRequested) { - std::unique_ptr<cta::RetrieveJob> job=m_parent.m_retrieveMount.getNextJob(m_parent.m_lc); - if(!job.get()) break; - files++; - bytes+=job->archiveFile.fileSize; - m_parent.m_jobs.emplace_back(job.release()); - } - + auto jobsList = m_parent.m_retrieveMount.getNextJobBatch(req.filesRequested, req.bytesRequested, m_parent.m_lc); + for (auto & j: jobsList) + m_parent.m_jobs.emplace_back(j.release()); LogContext::ScopedParam sp01(m_parent.m_lc, Param("transactionId", m_parent.m_retrieveMount.getMountTransactionId())); if (m_parent.m_jobs.empty()) { diff --git a/tapeserver/castor/tape/tapeserver/daemon/RecallTaskInjectorTest.cpp 
b/tapeserver/castor/tape/tapeserver/daemon/RecallTaskInjectorTest.cpp index 93567c66b45e2bbb5f0cfc8319862418d318f3d7..71c946d5e08e8711fd3b107c3f2e2db63a38336d 100644 --- a/tapeserver/castor/tape/tapeserver/daemon/RecallTaskInjectorTest.cpp +++ b/tapeserver/castor/tape/tapeserver/daemon/RecallTaskInjectorTest.cpp @@ -132,7 +132,6 @@ namespace unitTests class TestingDatabaseRetrieveMount: public cta::SchedulerDatabase::RetrieveMount { const MountInfo & getMountInfo() override { throw std::runtime_error("Not implemented"); } std::list<std::unique_ptr<cta::SchedulerDatabase::RetrieveJob> > getNextJobBatch(uint64_t filesRequested, uint64_t bytesRequested, cta::log::LogContext& logContext) override { throw std::runtime_error("Not implemented");} - std::unique_ptr<cta::SchedulerDatabase::RetrieveJob> getNextJob(cta::log::LogContext & logContext) override { throw std::runtime_error("Not implemented");} void complete(time_t completionTime) override { throw std::runtime_error("Not implemented"); } void setDriveStatus(cta::common::dataStructures::DriveStatus status, time_t completionTime) override { throw std::runtime_error("Not implemented"); } void setTapeSessionStats(const castor::tape::tapeserver::daemon::TapeSessionStats &stats) override { throw std::runtime_error("Not implemented"); } @@ -168,8 +167,8 @@ namespace unitTests tapeserver::daemon::RecallTaskInjector rti(mm, tapeRead, diskWrite, trm, maxNbJobsInjectedAtOnce, blockSize, lc); ASSERT_EQ(true, rti.synchronousFetch()); - ASSERT_EQ(maxNbJobsInjectedAtOnce+1, diskWrite.m_tasks.size()); - ASSERT_EQ(maxNbJobsInjectedAtOnce+1, tapeRead.m_tasks.size()); + ASSERT_EQ(maxNbJobsInjectedAtOnce, diskWrite.m_tasks.size()); + ASSERT_EQ(maxNbJobsInjectedAtOnce, tapeRead.m_tasks.size()); rti.startThreads(); rti.requestInjection(false); diff --git a/tapeserver/daemon/DriveHandler.cpp b/tapeserver/daemon/DriveHandler.cpp index 5f28a35425b65881c045912362851c635907f3ca..13c2d121c04971eaba80e87b03b59e42a5200949 100644 --- 
a/tapeserver/daemon/DriveHandler.cpp +++ b/tapeserver/daemon/DriveHandler.cpp @@ -97,10 +97,11 @@ const std::map<SessionState, DriveHandler::Timeout> DriveHandler::m_heartbeatTim //------------------------------------------------------------------------------ // The following structure represents the data movement timeouts for the session states involving // data movements and heartbeats. -// TODO: decide on values +// TODO: decide on values. TODO: bumped up to 15 minutes for the time being as the +// efficient retrieve requests pop is not implemented yet. const std::map<SessionState, DriveHandler::Timeout> DriveHandler::m_dataMovementTimeouts = { - {SessionState::Running, std::chrono::duration_cast<Timeout>(std::chrono::minutes(5))}, - {SessionState::DrainingToDisk, std::chrono::duration_cast<Timeout>(std::chrono::minutes(5))} + {SessionState::Running, std::chrono::duration_cast<Timeout>(std::chrono::minutes(15))}, + {SessionState::DrainingToDisk, std::chrono::duration_cast<Timeout>(std::chrono::minutes(15))} }; //------------------------------------------------------------------------------ @@ -861,6 +862,12 @@ int DriveHandler::runChild() { std::string hostname=cta::utils::getShortHostname(); + auto &lc=m_processManager.logContext(); + { + log::ScopedParamContainer params(lc); + params.add("objectStoreURL", m_tapedConfig.objectStoreURL.value()); + lc.log(log::DEBUG, "In DriveHandler::runChild(): will connect to object store backend."); + } // Before anything, we need to check we have access to the scheduler's central storages. 
std::unique_ptr<cta::objectstore::Backend> backend( cta::objectstore::BackendFactory::createBackend(m_tapedConfig.objectStoreURL.value()).release()); @@ -876,52 +883,62 @@ int DriveHandler::runChild() { try { std::string processName="DriveProcess-"; processName+=m_configLine.unitName; - backendPopulator.reset(new cta::objectstore::BackendPopulator(*backend, processName, m_processManager.logContext())); + log::ScopedParamContainer params(lc); + params.add("processName", processName); + lc.log(log::DEBUG, "In DriveHandler::runChild(): will create agent entry."); + backendPopulator.reset(new cta::objectstore::BackendPopulator(*backend, processName, lc)); } catch(cta::exception::Exception &ex) { - log::ScopedParamContainer param(m_processManager.logContext()); + log::ScopedParamContainer param(lc); param.add("errorMessage", ex.getMessageValue()); - m_processManager.logContext().log(log::CRIT, "In DriveHandler::runChild(): failed to instantiate agent entry. Reporting fatal error."); + lc.log(log::CRIT, "In DriveHandler::runChild(): failed to instantiate agent entry. 
Reporting fatal error."); driveHandlerProxy.reportState(tape::session::SessionState::Fatal, tape::session::SessionType::Undetermined, ""); sleep(1); return castor::tape::tapeserver::daemon::Session::MARK_DRIVE_AS_DOWN; } std::unique_ptr<cta::catalogue::Catalogue> catalogue; try { + log::ScopedParamContainer params(lc); + params.add("fileCatalogConfigFile", m_tapedConfig.fileCatalogConfigFile.value()); + lc.log(log::DEBUG, "In DriveHandler::runChild(): will get catalogue login information."); const cta::rdbms::Login catalogueLogin = cta::rdbms::Login::parseFile(m_tapedConfig.fileCatalogConfigFile.value()); const uint64_t nbConns = 1; const uint64_t nbArchiveFileListingConns = 0; + lc.log(log::DEBUG, "In DriveHandler::runChild(): will connect to catalogue."); catalogue=cta::catalogue::CatalogueFactory::create(m_sessionEndContext.logger(), catalogueLogin, nbConns, nbArchiveFileListingConns); - osdb.reset(new cta::OStoreDBWithAgent(*backend, backendPopulator->getAgentReference(), *catalogue, m_processManager.logContext().logger())); + osdb.reset(new cta::OStoreDBWithAgent(*backend, backendPopulator->getAgentReference(), *catalogue, lc.logger())); } catch(cta::exception::Exception &ex) { - log::ScopedParamContainer param(m_processManager.logContext()); + log::ScopedParamContainer param(lc); param.add("errorMessage", ex.getMessageValue()); - m_processManager.logContext().log(log::CRIT, "In DriveHandler::runChild(): failed to instantiate catalogue. Reporting fatal error."); + lc.log(log::CRIT, "In DriveHandler::runChild(): failed to instantiate catalogue. 
Reporting fatal error."); driveHandlerProxy.reportState(tape::session::SessionState::Fatal, tape::session::SessionType::Undetermined, ""); sleep(1); return castor::tape::tapeserver::daemon::Session::MARK_DRIVE_AS_DOWN; } + lc.log(log::DEBUG, "In DriveHandler::runChild(): will create scheduler."); cta::Scheduler scheduler(*catalogue, *osdb, m_tapedConfig.mountCriteria.value().maxFiles, m_tapedConfig.mountCriteria.value().maxBytes); // Before launching the transfer session, we validate that the scheduler is reachable. + lc.log(log::DEBUG, "In DriveHandler::runChild(): will ping scheduler."); try { - scheduler.ping(); + scheduler.ping(lc); } catch (cta::exception::Exception &ex) { - log::ScopedParamContainer param (m_processManager.logContext()); + log::ScopedParamContainer param (lc); param.add("errorMessage", ex.getMessageValue()); - m_processManager.logContext().log(log::CRIT, "In DriveHandler::runChild(): failed to ping central storage before session. Reporting fatal error."); + lc.log(log::CRIT, "In DriveHandler::runChild(): failed to ping central storage before session. Reporting fatal error."); driveHandlerProxy.reportState(tape::session::SessionState::Fatal, tape::session::SessionType::Undetermined, ""); return castor::tape::tapeserver::daemon::Session::MARK_DRIVE_AS_DOWN; } + lc.log(log::DEBUG, "In DriveHandler::runChild(): will start agent heartbeat."); // The object store is accessible, let's turn the agent heartbeat on. 
- objectstore::AgentHeartbeatThread agentHeartbeat(backendPopulator->getAgentReference(), *backend, m_processManager.logContext().logger()); + objectstore::AgentHeartbeatThread agentHeartbeat(backendPopulator->getAgentReference(), *backend, lc.logger()); agentHeartbeat.startThread(); // 1) Special case first, if we crashed in a cleaner session, we put the drive down if (m_previousSession == PreviousSession::Crashed && m_previousType == SessionType::Cleanup) { - log::ScopedParamContainer params(m_processManager.logContext()); + log::ScopedParamContainer params(lc); params.add("driveUnit", m_configLine.unitName); - m_processManager.logContext().log(log::ERR, "In DriveHandler::runChild(): the cleaner session crashed. Putting the drive down."); + lc.log(log::ERR, "In DriveHandler::runChild(): the cleaner session crashed. Putting the drive down."); // Get hold of the scheduler. try { cta::common::dataStructures::DriveInfo driveInfo; @@ -930,25 +947,25 @@ int DriveHandler::runChild() { driveInfo.host=hostname; scheduler.reportDriveStatus(driveInfo, cta::common::dataStructures::MountType::NoMount, cta::common::dataStructures::DriveStatus::Down); cta::common::dataStructures::SecurityIdentity securityIdentity; - scheduler.setDesiredDriveState(securityIdentity, m_configLine.unitName, false, false); + scheduler.setDesiredDriveState(securityIdentity, m_configLine.unitName, false, false, lc); return castor::tape::tapeserver::daemon::Session::MARK_DRIVE_AS_DOWN; } catch (cta::exception::Exception &ex) { - log::ScopedParamContainer param(m_processManager.logContext()); + log::ScopedParamContainer param(lc); param.add("errorMessage", ex.getMessageValue()); - m_processManager.logContext().log(log::CRIT, "In DriveHandler::runChild(): failed to set the drive down. Reporting fatal error."); + lc.log(log::CRIT, "In DriveHandler::runChild(): failed to set the drive down. 
Reporting fatal error."); driveHandlerProxy.reportState(tape::session::SessionState::Fatal, tape::session::SessionType::Undetermined, ""); sleep(1); return castor::tape::tapeserver::daemon::Session::MARK_DRIVE_AS_DOWN; } } - // 2) In the previous session crashed, we might want to run a cleaner session, depending + // 2) If the previous session crashed, we might want to run a cleaner session, depending // on the previous state std::set<SessionState> statesRequiringCleaner = { SessionState::Mounting, SessionState::Running, SessionState::Unmounting }; if (m_previousSession == PreviousSession::Crashed && statesRequiringCleaner.count(m_previousState)) { if (!m_previousVid.size()) { - m_processManager.logContext().log(log::ERR, "In DriveHandler::runChild(): Should run cleaner but VID is missing. Putting the drive down."); + lc.log(log::ERR, "In DriveHandler::runChild(): Should run cleaner but VID is missing. Putting the drive down."); try { cta::common::dataStructures::DriveInfo driveInfo; driveInfo.driveName=m_configLine.unitName; @@ -956,12 +973,12 @@ int DriveHandler::runChild() { driveInfo.host=hostname; scheduler.reportDriveStatus(driveInfo, cta::common::dataStructures::MountType::NoMount, cta::common::dataStructures::DriveStatus::Down); cta::common::dataStructures::SecurityIdentity securityIdentity; - scheduler.setDesiredDriveState(securityIdentity, m_configLine.unitName, false, false); + scheduler.setDesiredDriveState(securityIdentity, m_configLine.unitName, false, false, lc); return castor::tape::tapeserver::daemon::Session::MARK_DRIVE_AS_DOWN; } catch (cta::exception::Exception &ex) { - log::ScopedParamContainer param(m_processManager.logContext()); + log::ScopedParamContainer param(lc); param.add("errorMessage", ex.getMessageValue()); - m_processManager.logContext().log(log::CRIT, "In DriveHandler::runChild(): failed to set the drive down. Reporting fatal error."); + lc.log(log::CRIT, "In DriveHandler::runChild(): failed to set the drive down. 
Reporting fatal error."); driveHandlerProxy.reportState(tape::session::SessionState::Fatal, tape::session::SessionType::Undetermined, ""); sleep(1); return castor::tape::tapeserver::daemon::Session::MARK_DRIVE_AS_DOWN; @@ -969,25 +986,25 @@ int DriveHandler::runChild() { } // Log the decision { - log::ScopedParamContainer params(m_processManager.logContext()); + log::ScopedParamContainer params(lc); params.add("VID", m_previousVid) .add("driveUnit", m_configLine.unitName) .add("PreviousState", session::toString(m_sessionState)) .add("PreviousType", session::toString(m_sessionType)); - m_processManager.logContext().log(log::INFO, "In DriveHandler::runChild(): starting cleaner after crash with tape potentially loaded."); + lc.log(log::INFO, "In DriveHandler::runChild(): starting cleaner after crash with tape potentially loaded."); } // Capabilities management. cta::server::ProcessCap capUtils; // Mounting management. - cta::mediachanger::MediaChangerFacade mediaChangerFacade(m_processManager.logContext().logger()); + cta::mediachanger::MediaChangerFacade mediaChangerFacade(lc.logger()); castor::tape::System::realWrapper sWrapper; // TODO: the cleaner session does not yet report to the scheduler. // // Before launching the transfer session, we validate that the scheduler is reachable. // if (!scheduler.ping()) { -// m_processManager.logContext().log(log::CRIT, "In DriveHandler::runChild(): failed to ping central storage before cleaner. Reporting fatal error."); +// lc.log(log::CRIT, "In DriveHandler::runChild(): failed to ping central storage before cleaner. 
Reporting fatal error."); // driveHandlerProxy.reportState(tape::session::SessionState::Fatal, tape::session::SessionType::Undetermined, ""); // sleep(1); // return castor::tape::tapeserver::daemon::Session::MARK_DRIVE_AS_DOWN; @@ -996,7 +1013,7 @@ int DriveHandler::runChild() { castor::tape::tapeserver::daemon::CleanerSession cleanerSession( capUtils, mediaChangerFacade, - m_processManager.logContext().logger(), + lc.logger(), m_configLine, sWrapper, m_previousVid, @@ -1010,7 +1027,7 @@ int DriveHandler::runChild() { cta::server::ProcessCap capUtils; // Mounting management. - cta::mediachanger::MediaChangerFacade mediaChangerFacade(m_processManager.logContext().logger()); + cta::mediachanger::MediaChangerFacade mediaChangerFacade(lc.logger()); castor::tape::System::realWrapper sWrapper; @@ -1037,9 +1054,9 @@ int DriveHandler::runChild() { // put the drive down. if (m_previousSession == PreviousSession::Initiating) { // Log that we put the drive's desired state to down and do it. - log::ScopedParamContainer params(m_processManager.logContext()); + log::ScopedParamContainer params(lc); params.add("Drive", m_configLine.unitName); - m_processManager.logContext().log(log::INFO, "Setting the drive down at daemon startup"); + lc.log(log::INFO, "Setting the drive down at daemon startup"); try { // Before setting the desired state as down, we have to make sure the drive exists in the registry. // this is done by reporting the drive as down first. 
@@ -1049,10 +1066,10 @@ int DriveHandler::runChild() { driveInfo.host=hostname; scheduler.reportDriveStatus(driveInfo, common::dataStructures::MountType::NoMount, common::dataStructures::DriveStatus::Down); cta::common::dataStructures::SecurityIdentity securityIdentity; - scheduler.setDesiredDriveState(securityIdentity, m_configLine.unitName, false /* down */, false /* no force down*/); + scheduler.setDesiredDriveState(securityIdentity, m_configLine.unitName, false /* down */, false /* no force down*/, lc); } catch (cta::exception::Exception & ex) { params.add("Message", ex.getMessageValue()); - m_processManager.logContext().log(log::CRIT, "In DriveHandler::runChild(): failed to set drive down"); + lc.log(log::CRIT, "In DriveHandler::runChild(): failed to set drive down"); // This is a fatal error (failure to access the scheduler). Shut daemon down. driveHandlerProxy.reportState(tape::session::SessionState::Fatal, tape::session::SessionType::Undetermined, ""); return castor::tape::tapeserver::daemon::Session::MARK_DRIVE_AS_DOWN; @@ -1061,7 +1078,7 @@ int DriveHandler::runChild() { castor::tape::tapeserver::daemon::DataTransferSession dataTransferSession( cta::utils::getShortHostname(), - m_processManager.logContext().logger(), + lc.logger(), sWrapper, m_configLine, mediaChangerFacade, diff --git a/tapeserver/daemon/GarbageCollectorHandler.cpp b/tapeserver/daemon/GarbageCollectorHandler.cpp index 9feb2cb56088f2f4e83d7fe55d71378ec95067bc..ceacf3118130bd230f924aec2569f19e19f95dc1 100644 --- a/tapeserver/daemon/GarbageCollectorHandler.cpp +++ b/tapeserver/daemon/GarbageCollectorHandler.cpp @@ -275,7 +275,7 @@ int GarbageCollectorHandler::runChild() { catalogue=cta::catalogue::CatalogueFactory::create(m_processManager.logContext().logger(), catalogueLogin, nbConns, nbArchiveFileListingConns); scheduler=make_unique<cta::Scheduler>(*catalogue, *osdb, 5, 2*1000*1000); //TODO: we have hardcoded the mount policy parameters here temporarily we will remove them once we know 
where to put them // Before launching the transfer session, we validate that the scheduler is reachable. - scheduler->ping(); + scheduler->ping(m_processManager.logContext()); } catch(cta::exception::Exception &ex) { { log::ScopedParamContainer param(m_processManager.logContext()); diff --git a/tests/cta-valgrindUnitTests.sh.in b/tests/cta-valgrindUnitTests.sh.in index 6a423b00b89fe1405ada945ce9db0ea342fcaaf5..b65af459392372df77d78c5925c2f7694dd98e61 100644 --- a/tests/cta-valgrindUnitTests.sh.in +++ b/tests/cta-valgrindUnitTests.sh.in @@ -9,7 +9,7 @@ valgrind --track-fds=yes --leak-check=full --demangle=yes --gen-suppressions=all --error-exitcode=1 --suppressions=/usr/share/cta-@CTA_VERSION@/unittest/valgrind.suppr \ /usr/bin/cta-unitTests -valgrind --tool=helgrind -v --demangle=yes --gen-suppressions=all --conflict-cache-size=30000000 \ +valgrind --tool=helgrind -v --demangle=yes --gen-suppressions=all --conflict-cache-size=10000000 \ --error-exitcode=1 --suppressions=/usr/share/cta-@CTA_VERSION@/unittest/helgrind.suppr \ /usr/bin/cta-unitTests @@ -17,6 +17,6 @@ valgrind --track-fds=yes --leak-check=full --demangle=yes --gen-suppressions=all --error-exitcode=1 --suppressions=/usr/share/cta-@CTA_VERSION@/unittest/valgrind.suppr \ --child-silent-after-fork=yes /usr/bin/cta-unitTests-multiProcess -valgrind --tool=helgrind -v --demangle=yes --gen-suppressions=all --conflict-cache-size=30000000 \ +valgrind --tool=helgrind -v --demangle=yes --gen-suppressions=all --conflict-cache-size=10000000 \ --error-exitcode=1 --suppressions=/usr/share/cta-@CTA_VERSION@/unittest/helgrind.suppr \ /usr/bin/cta-unitTests-multiProcess diff --git a/tests/scripts/500GB-file-stream.sh b/tests/scripts/500GB-file-stream.sh new file mode 100755 index 0000000000000000000000000000000000000000..a1806e045e7a027f84b105d0bd32fa56dda2ded1 --- /dev/null +++ b/tests/scripts/500GB-file-stream.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +TESTFILENAME=/var/tmp/cta-test-temporary-file-$$ 
+EOSDIR=/eos/dev/test/vlado/`date +%Y%m%d%H%M` + +export XRD_STREAMTIMEOUT=600 # increased from 60s +export XRD_TIMEOUTRESOLUTION=600 # increased from 15s + +# Usage +if ! [[ "0" -lt "$1" ]] +then + echo "Usage: $0 <filesize in bytes>" + exit -1 +fi + +TESTFILESIZE=$1 + +# Generate random file (make smaller (< 1 GB) files ASCII hence compressible (size not accurate though)) +if [ "$TESTFILESIZE" -lt "1000000000" ] +then + openssl rand -base64 -out $TESTFILENAME $TESTFILESIZE +else + openssl rand -out $TESTFILENAME $TESTFILESIZE +fi + +# Compute how many of these files would be needed to fill 100 GB +let FILESCOUNT=100000000000/$TESTFILESIZE + +echo "EOS directory: $EOSDIR, File size: $TESTFILESIZE, Files count: $FILESCOUNT" + +for i in `seq 1 $FILESCOUNT` +do + if [ "$TESTFILESIZE" -lt "1000000000" ] + then + # Make smaller (< 1 GB) source files unique in order to exercise different checksums + echo $i >> $TESTFILENAME + fi + xrdcp $TESTFILENAME root://eosctatape//$EOSDIR/file_$i && echo "File $EOSDIR/file_$i copied at "`date "+%Y-%m-%d %H:%M:%S"` +done + +# Cleanup +rm -f $TESTFILENAME diff --git a/tests/scripts/bulk-cta-retrieve.sh b/tests/scripts/bulk-cta-retrieve.sh new file mode 100755 index 0000000000000000000000000000000000000000..6cb12fcafc690a72e7386e3b7b8dd06c86debfb2 --- /dev/null +++ b/tests/scripts/bulk-cta-retrieve.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# this script retrieves all files from a given EOS directory that are d0::t1 + + + +EOS_MGM_URL=root://eosctatape.cern.ch +TEST_FILE_NAME=/var/tmp/cta-test-temporary-file-$$ +TEST_FILE_LIST=/var/tmp/cta-test-temporary-list-$$ +TEMPORARY_PID_FILE=/var/tmp/cta-test-temporary-pid-file-$$ +EOS_KEYTAB=/var/tmp/cta-test-temporary-kt-$$ + +TIMEOUT=72000 # pump up da jam + +SLEEP=15 + +export XRD_STREAMTIMEOUT=3600 # increased from 60s +export XRD_TIMEOUTRESOLUTION=3600 # increased from 15s + +# Usage +if [ "" == "$1" ] +then + echo "Usage: $0 <eosdirectory>" + exit -1 +fi + +EOS_DIR=$1 + +umask 077 + 
+USER_ID=`/usr/bin/id -u` +GROUP_ID=`/usr/bin/id -g` + + +echo "EOS directory: $EOS_DIR, User ID: $USER_ID, Group ID: $GROUP_ID, Test file name: $TEST_FILE_NAME, Test file list: $TEST_FILE_LIST" + + + +# Generate file list +/usr/bin/date "+%Y-%m-%d %H:%M:%S" +/usr/bin/echo -e "***\n*** Creating file list $TEST_FILE_LIST for files in $EOS_DIR directory ...\n***" +/usr/bin/rm -f $TEST_FILE_LIST + +eos $EOS_MGM_URL ls -y $EOS_DIR > $TEST_FILE_LIST.all || exit -1 +grep ^d0::t1 $TEST_FILE_LIST.all | awk "{print \"$EOS_DIR/\" \$NF}" > $TEST_FILE_LIST +/usr/bin/sed -e 's/^/ls -y /' $TEST_FILE_LIST > $TEST_FILE_LIST.eosh # Prepare the file for EOS batch mode +/usr/bin/rm -f $TEST_FILE_LIST.all + +FILES_COUNT=`cat $TEST_FILE_LIST | wc -l` +/usr/bin/date "+%Y-%m-%d %H:%M:%S" +/usr/bin/echo -e "***\n*** Files to be recalled: $FILES_COUNT \n***" + + +# Retrieve files from tape (using here hard coded CTA keytab - THIS IS NOT SAFE !!!) +/usr/bin/date "+%Y-%m-%d %H:%M:%S" +/usr/bin/echo -e "***\n*** Retrieving tape replicas for $FILES_COUNT files in the $EOS_DIR directory from $TEST_FILE_LIST file list onto EOS disk ...\n***" +echo "0 u:cta g:cta n:cta-taped N:6425591835858577167 c:1496074683 e:0 f:0 k:b7825f9dd4d72952e429e342dd687aa8735411c29587ddd052613e33c0792e0b" > $EOS_KEYTAB +chmod 400 $EOS_KEYTAB +/usr/bin/xargs -a $TEST_FILE_LIST -I {} echo "{}?eos.ruid=$USER_ID&eos.rgid=$GROUP_ID" | XrdSecPROTOCOL=sss XrdSecSSSKT=$EOS_KEYTAB /usr/bin/xargs -n 5 -P50 /usr/bin/xrdfs eosctatape prepare -s + + +# Wait $TIMEOUT for files until they are recalled +/usr/bin/date "+%Y-%m-%d %H:%M:%S" +/usr/bin/echo -e "***\n*** Waiting until the files are retrieved from tape (expecting d1::t1 in front of each file entry using the EOS ls -y command), timeout after $TIMEOUT seconds ...\n***" +FILES_TO_BE_RETRIEVED=1 +SECONDS_PASSED=0 +while [ "$FILES_TO_BE_RETRIEVED" -gt 0 ] +do +# FILES_TO_BE_RETRIEVED=`/usr/bin/xargs -a $TEST_FILE_LIST -L 1 -P 10 -I {} /usr/bin/eos ls -y {} | /usr/bin/grep -v 
'^d1::t1' -c` + FILES_TO_BE_RETRIEVED=`/usr/bin/eos --batch $EOS_MGM_URL $TEST_FILE_LIST.eosh | /usr/bin/grep '^d0::t1' -c` + CURRENT_TIME=`/usr/bin/date "+%Y-%m-%d %H:%M:%S"` + echo "$CURRENT_TIME Scanned $EOS_DIR using file list $TEST_FILE_LIST - files to be retrieved: $FILES_TO_BE_RETRIEVED" + sleep $SLEEP + let SECONDS_PASSED=$SECONDS_PASSED+$SLEEP + if [ "$SECONDS_PASSED" -gt "$TIMEOUT" ] + then + echo "$0: timing out after $TIMEOUT seconds, stopping the test." + exit -1 + fi +done + +# Cleanup +/usr/bin/date "+%Y-%m-%d %H:%M:%S" +# /usr/bin/rm -f $TEST_FILE_NAME $TEST_FILE_LIST $TEST_FILE_LIST.eosh $EOS_KEYTAB + +exit 0 diff --git a/tests/scripts/bulk-drop-disk-replica.sh b/tests/scripts/bulk-drop-disk-replica.sh new file mode 100755 index 0000000000000000000000000000000000000000..ea26829ff8d9f8eafb3e1fa354f4024066de7e75 --- /dev/null +++ b/tests/scripts/bulk-drop-disk-replica.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +# this script deletes all disk replicas from a given EOS directory that are dX::t1 , X>=1 + + + +EOS_MGM_URL=root://eosctatape.cern.ch +TEST_FILE_NAME=/var/tmp/cta-test-temporary-file-$$ +TEST_FILE_LIST=/var/tmp/cta-test-temporary-list-$$ +TEMPORARY_PID_FILE=/var/tmp/cta-test-temporary-pid-file-$$ +EOS_KEYTAB=/var/tmp/cta-test-temporary-kt-$$ +TAPE_FS_ID=65535 + +TIMEOUT=72000 # pump up da jam + +SLEEP=15 + +export XRD_STREAMTIMEOUT=3600 # increased from 60s +export XRD_TIMEOUTRESOLUTION=3600 # increased from 15s + +# Usage +if [ "" == "$1" ] +then + echo "Usage: $0 <eosdirectory>" + exit -1 +fi + +EOS_DIR=$1 + +umask 077 + +USER_ID=`/usr/bin/id -u` +GROUP_ID=`/usr/bin/id -g` + + +echo "EOS directory: $EOS_DIR, User ID: $USER_ID, Group ID: $GROUP_ID, Test file name: $TEST_FILE_NAME, Test file list: $TEST_FILE_LIST" + + + +# Generate file list +/usr/bin/date "+%Y-%m-%d %H:%M:%S" +/usr/bin/echo -e "***\n*** Creating file list $TEST_FILE_LIST for files in $EOS_DIR directory ...\n***" +/usr/bin/rm -f $TEST_FILE_LIST + +eos $EOS_MGM_URL ls -y $EOS_DIR > 
$TEST_FILE_LIST.tmp || exit -1 +grep -v ^d0:: $TEST_FILE_LIST.tmp | grep -v '^\<d.::t0\>' | awk "{print \"$EOS_DIR/\" \$NF}" > $TEST_FILE_LIST +/usr/bin/rm -f $TEST_FILE_LIST.tmp + +FILES_COUNT=`cat $TEST_FILE_LIST | wc -l` +/usr/bin/date "+%Y-%m-%d %H:%M:%S" +/usr/bin/echo -e "***\n*** Files for which to drop disk copy: $FILES_COUNT \n***" + +for FILE_PATH in `cat $TEST_FILE_LIST`; do + for DISK_FSID in `eos file info "${FILE_PATH}" -m | sed s/\ /'\n'/g | grep fsid | sed s/fsid=// | grep -v ${TAPE_FS_ID}`; do + #echo "deleting disk replica with fsid ${DISK_FSID} for ${FILE_PATH}" + if ! eos -r 0 0 file drop "${FILE_PATH}" ${DISK_FSID} >/dev/null; then + echo "failed to delete disk replica with fsid ${DISK_FSID} for ${FILE_PATH}" + fi + done +done + + +# Cleanup +/usr/bin/date "+%Y-%m-%d %H:%M:%S" +# /usr/bin/rm -f $TEST_FILE_NAME $TEST_FILE_LIST + +exit 0 diff --git a/tests/scripts/many-files-archive-migrate-retrieve-delete.sh b/tests/scripts/many-files-archive-migrate-retrieve-delete.sh new file mode 100755 index 0000000000000000000000000000000000000000..201e7bba63e458d4e9c6ca6b1ccb51108c778844 --- /dev/null +++ b/tests/scripts/many-files-archive-migrate-retrieve-delete.sh @@ -0,0 +1,114 @@ +#!/bin/bash + +FILES_COUNT=1000 +EOS_DIR=/eos/dev/test/`whoami`/`/usr/bin/date +%Y%m%d%H%M` +EOS_MGM_URL=root://eosctatape.cern.ch +TEST_FILE_NAME=/var/tmp/cta-test-temporary-file-$$ +TEST_FILE_LIST=/var/tmp/cta-test-temporary-list-$$ +TEMPORARY_PID_FILE=/var/tmp/cta-test-temporary-pid-file +EOS_KEYTAB=/var/tmp/cta-test-temporary-kt-$$ +TIMEOUT=7200 +SLEEP=15 + +export XRD_STREAMTIMEOUT=3600 # increased from 60s +export XRD_TIMEOUTRESOLUTION=3600 # increased from 15s + +# Usage +if ! 
[[ "0" -lt "$1" ]] +then + echo "Usage: $0 <filesize in bytes>" + exit -1 +fi + +TEST_FILE_SIZE=$1 + +umask 077 + +USER_ID=`/usr/bin/id -u` +GROUP_ID=`/usr/bin/id -g` + +# Generate random file (make smaller (< 1 GB) files ASCII hence compressible (size not accurate though)) +#/usr/bin/rm -f $TEST_FILE_NAME +#if [ "$TEST_FILE_SIZE" -lt "1000000000" ] +#then +# /usr/bin/openssl rand $TEST_FILE_SIZE -base64 -out $TEST_FILE_NAME +#else + /usr/bin/openssl rand $TEST_FILE_SIZE -out $TEST_FILE_NAME +#fi + +#echo "EOS directory: $EOS_DIR, File size: $TEST_FILE_SIZE, Files count: $FILES_COUNT, User ID: $USER_ID, Group ID: $GROUP_ID, Test file name: $TEST_FILE_NAME, Test file list: $TEST_FILE_LIST" + +# Generate file list +/usr/bin/date "+%Y-%m-%d %H:%M:%S" +/usr/bin/echo -e "***\n*** Creating file list $TEST_FILE_LIST for $FILES_COUNT files in $EOS_DIR directory ...\n***" +/usr/bin/rm -f $TEST_FILE_LIST +for i in `seq 1 $FILES_COUNT` +do + echo "$EOS_DIR/testfile-$i" >> $TEST_FILE_LIST +done +/usr/bin/sed -e 's/^/ls -y /' $TEST_FILE_LIST > $TEST_FILE_LIST.eosh # Prepare the file for EOS batch mode + +# Hard coded KEYTAB - this is BAD! 
+echo "0 u:cta g:cta n:cta-taped N:6425591835858577167 c:1496074683 e:0 f:0 k:b7825f9dd4d72952e429e342dd687aa8735411c29587ddd052613e33c0792e0b" > $EOS_KEYTAB +chmod 400 $EOS_KEYTAB + +# Copy the test file to EOS (many times; in parallel) for archival +/usr/bin/date "+%Y-%m-%d %H:%M:%S" +/usr/bin/echo -e "***\n*** Copying $FILES_COUNT files from $TEST_FILE_LIST file list to EOS into $EOS_DIR directory ...\n***" +/usr/bin/xargs -a $TEST_FILE_LIST -L 1 -P 10 -I {} /usr/bin/xrdcp --nopbar $TEST_FILE_NAME 'root://eosctatape//{}' + +# Wait $TIMEOUT for files until they are migrated +/usr/bin/date "+%Y-%m-%d %H:%M:%S" +/usr/bin/echo -e "***\n*** Waiting until the files are archived on tape (expecting d0::t1 in front of each file entry using the EOS ls -y command), timeout after $TIMEOUT seconds ...\n***" +FILES_TO_BE_ARCHIVED=1 +SECONDS_PASSED=0 +while [ "$FILES_TO_BE_ARCHIVED" -gt 0 ] +do +# FILES_TO_BE_ARCHIVED=`/usr/bin/xargs -a $TEST_FILE_LIST -L 1 -P 10 -I {} /usr/bin/eos ls -y {} | /usr/bin/grep -v '^d0::t1' -c` + FILES_TO_BE_ARCHIVED=`/usr/bin/eos --batch $EOS_MGM_URL $TEST_FILE_LIST.eosh | /usr/bin/grep -v '^d0::t1' -c` + CURRENT_TIME=`/usr/bin/date "+%Y-%m-%d %H:%M:%S"` + echo "$CURRENT_TIME Scanned $EOS_DIR using file list $TEST_FILE_LIST - files to be archived: $FILES_TO_BE_ARCHIVED" + sleep $SLEEP + let SECONDS_PASSED=$SECONDS_PASSED+$SLEEP + if [ "$SECONDS_PASSED" -gt "$TIMEOUT" ] + then + echo "$0: timing out after $TIMEOUT seconds, stopping the test." + exit -1 + fi +done + +# Retrieve files from tape (using here hard coded CTA keytab - THIS IS NOT SAFE !!!) 
+/usr/bin/date "+%Y-%m-%d %H:%M:%S" +/usr/bin/echo -e "***\n*** Retrieving tape replicas for $FILES_COUNT files in the $EOS_DIR directory from $TEST_FILE_LIST file list onto EOS disk ...\n***" +/usr/bin/xargs -a $TEST_FILE_LIST -I {} echo "{}?eos.ruid=$USER_ID&eos.rgid=$GROUP_ID" | XrdSecPROTOCOL=sss XrdSecSSSKT=$EOS_KEYTAB /usr/bin/xargs -n 5 -P50 /usr/bin/xrdfs eosctatape prepare -s + +# Wait $TIMEOUT for files until they are retrieved +/usr/bin/date "+%Y-%m-%d %H:%M:%S" +/usr/bin/echo -e "***\n*** Waiting until the files are retrieved from tape (expecting d1::t1 in front of each file entry using the EOS ls -y command), timeout after $TIMEOUT seconds ...\n***" +FILES_TO_BE_RETRIEVED=1 +SECONDS_PASSED=0 +while [ "$FILES_TO_BE_RETRIEVED" -gt 0 ] +do +# FILES_TO_BE_RETRIEVED=`/usr/bin/xargs -a $TEST_FILE_LIST -L 1 -P 10 -I {} /usr/bin/eos ls -y {} | /usr/bin/grep -v '^d1::t1' -c` + FILES_TO_BE_RETRIEVED=`/usr/bin/eos --batch $EOS_MGM_URL $TEST_FILE_LIST.eosh | /usr/bin/grep '^d0::t1' -c` + CURRENT_TIME=`/usr/bin/date "+%Y-%m-%d %H:%M:%S"` + echo "$CURRENT_TIME Scanned $EOS_DIR using file list $TEST_FILE_LIST - files to be retrieved: $FILES_TO_BE_RETRIEVED" + sleep $SLEEP + let SECONDS_PASSED=$SECONDS_PASSED+$SLEEP + if [ "$SECONDS_PASSED" -gt "$TIMEOUT" ] + then + echo "$0: timing out after $TIMEOUT seconds, stopping the test." 
+ exit -1 + fi +done + +# Removing files from EOS +#/usr/bin/date "+%Y-%m-%d %H:%M:%S" +#/usr/bin/echo -e "***\n*** Removing ALL files from EOS $EOS_DIR directory ...\n***" +#/usr/bin/eos $EOS_MGM_URL rm -r $EOS_DIR + +# Cleanup +/usr/bin/date "+%Y-%m-%d %H:%M:%S" +# /usr/bin/rm -f $TEST_FILE_NAME $TEST_FILE_LIST $TEST_FILE_LIST.eosh $EOS_KEYTAB + +exit 0 diff --git a/xroot_plugins/XrdCtaFile.cpp b/xroot_plugins/XrdCtaFile.cpp index 868eb189cca200805c82b6b32f5599695620cc53..213848637b915835d87ad0bdf62cb23a51f22e2e 100644 --- a/xroot_plugins/XrdCtaFile.cpp +++ b/xroot_plugins/XrdCtaFile.cpp @@ -137,7 +137,8 @@ void XrdCtaFile::authorizeAdmin(){ if(m_protocol!="krb5") { throw cta::exception::Exception(std::string("[ERROR] Admin commands are possible only through Kerberos 5 protocol authentication. Protocol used for this connection: ")+m_protocol); } - m_scheduler->authorizeAdmin(m_cliIdentity); + log::LogContext lc(m_log); + m_scheduler->authorizeAdmin(m_cliIdentity, lc); } //------------------------------------------------------------------------------ @@ -562,6 +563,16 @@ std::string XrdCtaFile::timeToString(const time_t &time) { return timeString; } +//------------------------------------------------------------------------------ +// BytesToMbString +//------------------------------------------------------------------------------ +std::string XrdCtaFile::BytesToMbString(const uint64_t bytes) { + std::ostringstream oss; + const long double mBytes = (long double)bytes/1000.0/1000.0; + oss << std::setprecision(2) << std::fixed << mBytes; + return oss.str(); +} + //------------------------------------------------------------------------------ // formatResponse //------------------------------------------------------------------------------ @@ -1776,6 +1787,7 @@ std::string XrdCtaFile::xCom_test() { // xCom_drive //------------------------------------------------------------------------------ std::string XrdCtaFile::xCom_drive() { + log::LogContext lc(m_log); 
std::stringstream cmdlineOutput; std::stringstream help; help << m_requestTokens.at(0) << " dr/drive up/down/ls (it is a synchronous command):" << std::endl @@ -1794,7 +1806,7 @@ std::string XrdCtaFile::xCom_drive() { // Here the drive name is required in addition if (m_requestTokens.size() != 4) throw cta::exception::UserError(help.str()); - m_scheduler->setDesiredDriveState(m_cliIdentity, m_requestTokens.at(3), true, false); + m_scheduler->setDesiredDriveState(m_cliIdentity, m_requestTokens.at(3), true, false, lc); cmdlineOutput << "Drive " << m_requestTokens.at(3) << " set UP." << std::endl; } else if ("down" == m_requestTokens.at(2)) { // Parse the command line for option and drive name. @@ -1811,7 +1823,7 @@ std::string XrdCtaFile::xCom_drive() { } // Check if the force option was present. bool force=reply.options.size() && (reply.options.at(0).option == "f"); - m_scheduler->setDesiredDriveState(m_cliIdentity, reply.remainder.at(0), false, force); + m_scheduler->setDesiredDriveState(m_cliIdentity, reply.remainder.at(0), false, force, lc); cmdlineOutput << "Drive " << reply.remainder.at(0) << " set DOWN"; if (force) { cmdlineOutput << " (forced down)"; @@ -1822,12 +1834,12 @@ std::string XrdCtaFile::xCom_drive() { // We will dump all the drives, and select the one asked for if needed. 
bool singleDrive = (m_requestTokens.size() == 4); bool driveFound = false; - auto driveStates = m_scheduler->getDriveStates(m_cliIdentity); + auto driveStates = m_scheduler->getDriveStates(m_cliIdentity, lc); if (driveStates.size()) { std::vector<std::vector<std::string>> responseTable; - std::vector<std::string> headers = {"drive", "host", "library", "request", - "status", "since", "desired", "vid", "tapepool", "files", "Mbytes", - "speed", "session", "age"}; + std::vector<std::string> headers = {"library", "drive", "host", "desired", "request", + "status", "since", "vid", "tapepool", "files", "MBytes", + "MB/s", "session", "age"}; responseTable.push_back(headers); typedef decltype(*driveStates.begin()) dStateVal; driveStates.sort([](const dStateVal & a, const dStateVal & b){ return a.driveName < b.driveName; }); @@ -1835,9 +1847,10 @@ std::string XrdCtaFile::xCom_drive() { if (singleDrive && m_requestTokens.at(3) != ds.driveName) continue; driveFound = true; std::vector<std::string> currentRow; + currentRow.push_back(ds.logicalLibrary); currentRow.push_back(ds.driveName); currentRow.push_back(ds.host); - currentRow.push_back(ds.logicalLibrary); + currentRow.push_back(ds.desiredDriveState.up?"Up":"Down"); currentRow.push_back(cta::common::dataStructures::toString(ds.mountType)); currentRow.push_back(cta::common::dataStructures::toString(ds.driveStatus)); // print the time spent in the current state @@ -1871,13 +1884,28 @@ std::string XrdCtaFile::xCom_drive() { currentRow.push_back("-"); break; } - currentRow.push_back(ds.desiredDriveState.up?"Up":"Down"); currentRow.push_back(ds.currentVid==""?"-":ds.currentVid); currentRow.push_back(ds.currentTapePool==""?"-":ds.currentTapePool); - currentRow.push_back(std::to_string((unsigned long long)ds.filesTransferredInSession)); - currentRow.push_back(std::to_string((long double)ds.bytesTransferredInSession/Mbytes)); - currentRow.push_back(std::to_string((long double)ds.latestBandwidth/Mbytes)); - 
currentRow.push_back(std::to_string((unsigned long long)ds.sessionId)); + switch (ds.driveStatus) { + case cta::common::dataStructures::DriveStatus::Transferring: + currentRow.push_back(std::to_string((unsigned long long)ds.filesTransferredInSession)); + currentRow.push_back(BytesToMbString(ds.bytesTransferredInSession)); + currentRow.push_back(BytesToMbString(ds.latestBandwidth)); + break; + default: + currentRow.push_back("-"); + currentRow.push_back("-"); + currentRow.push_back("-"); + } + switch(ds.driveStatus) { + case cta::common::dataStructures::DriveStatus::Up: + case cta::common::dataStructures::DriveStatus::Down: + case cta::common::dataStructures::DriveStatus::Unknown: + currentRow.push_back("-"); + break; + default: + currentRow.push_back(std::to_string((unsigned long long)ds.sessionId)); + } currentRow.push_back(std::to_string((unsigned long long)(time(nullptr)-ds.lastUpdateTime))); responseTable.push_back(currentRow); } @@ -2025,7 +2053,7 @@ std::string XrdCtaFile::xCom_listpendingretrieves() { } //------------------------------------------------------------------------------ -// xCom_listpendingretrieves +// xCom_showqueues //------------------------------------------------------------------------------ std::string XrdCtaFile::xCom_showqueues() { std::stringstream cmdlineOutput; @@ -2035,8 +2063,8 @@ std::string XrdCtaFile::xCom_showqueues() { auto queuesAndMounts=m_scheduler->getQueuesAndMountSummaries(lc); if (queuesAndMounts.size()) { std::vector<std::vector<std::string>> responseTable; - std::vector<std::string> header = {"type","tapepool","vid","files queued","Mbytes queued","oldest age","priority","min age","max drives", - "cur. mounts", "cur. files", "cur. Mbytes", "speed", "next mounts", "tapes capacity", "files on tapes", "Mbytes on tapes", "full tapes", "empty tapes", + std::vector<std::string> header = {"type","tapepool","vid","files queued","MBytes queued","oldest age","priority","min age","max drives", + "cur. mounts", "cur. 
files", "cur. MBytes", "MB/s", "next mounts", "tapes capacity", "files on tapes", "MBytes on tapes", "full tapes", "empty tapes", "disabled tapes", "writables tapes"}; if(hasOption("-h", "--header")) responseTable.push_back(header); for (auto & q: queuesAndMounts) { @@ -2045,7 +2073,7 @@ std::string XrdCtaFile::xCom_showqueues() { currentRow.push_back(q.tapePool); currentRow.push_back(q.vid); currentRow.push_back(std::to_string(q.filesQueued)); - currentRow.push_back(std::to_string((long double)q.bytesQueued/Mbytes)); + currentRow.push_back(BytesToMbString(q.bytesQueued)); currentRow.push_back(std::to_string(q.oldestJobAge)); if (common::dataStructures::MountType::Archive == q.mountType) { currentRow.push_back(std::to_string(q.mountPolicy.archivePriority)); @@ -2062,12 +2090,12 @@ std::string XrdCtaFile::xCom_showqueues() { } currentRow.push_back(std::to_string(q.currentMounts)); currentRow.push_back(std::to_string(q.currentFiles)); - currentRow.push_back(std::to_string((long double)q.currentBytes/Mbytes)); - currentRow.push_back(std::to_string((long double)q.latestBandwidth/Mbytes)); + currentRow.push_back(BytesToMbString(q.currentBytes)); + currentRow.push_back(BytesToMbString(q.latestBandwidth)); currentRow.push_back(std::to_string(q.nextMounts)); - currentRow.push_back(std::to_string(q.tapesCapacity/(uint64_t)Mbytes)); + currentRow.push_back(std::to_string(q.tapesCapacity/(uint64_t)MBytes)); currentRow.push_back(std::to_string(q.filesOnTapes)); - currentRow.push_back(std::to_string((long double)q.dataOnTapes/Mbytes)); + currentRow.push_back(BytesToMbString(q.dataOnTapes)); currentRow.push_back(std::to_string(q.fullTapes)); currentRow.push_back(std::to_string(q.emptyTapes)); currentRow.push_back(std::to_string(q.disabledTapes)); @@ -2187,12 +2215,10 @@ std::string XrdCtaFile::xCom_deletearchive() { cta::common::dataStructures::DeleteArchiveRequest request; request.archiveFileID=id.value(); request.requester=originator; - cta::utils::Timer t; - 
m_scheduler->deleteArchive(m_cliIdentity.username, request); log::LogContext lc(m_log); + m_scheduler->deleteArchive(m_cliIdentity.username, request, lc); log::ScopedParamContainer params(lc); - params.add("fileId", request.archiveFileID) - .add("catalogueTime", t.secs()); + params.add("fileId", request.archiveFileID); lc.log(log::INFO, "In XrdCtaFile::xCom_deletearchive(): deleted archive file."); return ""; } diff --git a/xroot_plugins/XrdCtaFile.hpp b/xroot_plugins/XrdCtaFile.hpp index f7e31c9d2db631e2d41d0661daa3a8d8a3bf60f9..cef1f0cca3a74c09e50bc93c52441403ae4659b0 100644 --- a/xroot_plugins/XrdCtaFile.hpp +++ b/xroot_plugins/XrdCtaFile.hpp @@ -379,6 +379,14 @@ protected: */ std::string timeToString(const time_t &time); + /** + * Returns a string representation for bytes in Mbytes with .00 precision + * + * @param bytes The input number of bytes + * @return a string representation in Mbytes + */ + std::string BytesToMbString(const uint64_t bytes); + /** * Adds the creation log and the last modification log to the current response row * @@ -456,5 +464,5 @@ protected: optional<std::string> EOS2CTAChecksumValue(const optional<std::string> &EOSChecksumValue); }; -const long double Mbytes = 1000.0*1000.0; +const long double MBytes = 1000.0*1000.0; }} diff --git a/xroot_plugins/XrdCtaFilesystem.cpp b/xroot_plugins/XrdCtaFilesystem.cpp index 3d000daf575354b8576c66729a70065bce413d31..336dc8eb5d8bb60eb9b746e427ed0f6120a13e29 100644 --- a/xroot_plugins/XrdCtaFilesystem.cpp +++ b/xroot_plugins/XrdCtaFilesystem.cpp @@ -89,6 +89,15 @@ XrdSfsDirectory * XrdCtaFilesystem::newDir(char *user, int MonID) return new cta::xrootPlugins::XrdCtaDir(m_catalogue.get(), m_log.get(), user, MonID);; } +//------------------------------------------------------------------------------ +// FSctl +//------------------------------------------------------------------------------ +int XrdCtaFilesystem::FSctl(const int cmd, XrdSfsFSctl &args, XrdOucErrInfo &eInfo, const XrdSecEntity *client) { + 
(void)cmd; (void)args; (void)eInfo; (void)client; + eInfo.setErrInfo(ENOTSUP, "Not supported."); + return SFS_ERROR; +} + //------------------------------------------------------------------------------ // fsctl //------------------------------------------------------------------------------ diff --git a/xroot_plugins/XrdCtaFilesystem.hpp b/xroot_plugins/XrdCtaFilesystem.hpp index 2fd9504488af9df8835d434dc47ab003a8b569cf..3d71292aaa0705ed1affab40b0ccfdc20864ba40 100644 --- a/xroot_plugins/XrdCtaFilesystem.hpp +++ b/xroot_plugins/XrdCtaFilesystem.hpp @@ -172,6 +172,7 @@ protected: xbuf->SetLen(dataSize); return UniqueXrdOucBuffer(xbuf); } + }; // XrdCtaFilesystem }}