From 99c2b5696a3e0113070a51e20ab803b91c1a4f2b Mon Sep 17 00:00:00 2001 From: Cedric CAFFY <cedric.caffy@cern.ch> Date: Mon, 12 Aug 2019 17:48:01 +0200 Subject: [PATCH] [REPACK] Added a failed to create archive request counter so that the Repack Request is complete or failed if Retrieve subrequest is failed (creation or execution) --- cmdline/CtaAdminTextFormatter.cpp | 2 +- objectstore/RepackRequest.cpp | 26 ++++++++++++++++++++----- objectstore/RepackRequest.hpp | 4 +++- objectstore/cta.proto | 1 + scheduler/OStoreDB/OStoreDB.cpp | 32 +++++++++++++++++-------------- 5 files changed, 44 insertions(+), 21 deletions(-) diff --git a/cmdline/CtaAdminTextFormatter.cpp b/cmdline/CtaAdminTextFormatter.cpp index 86d104242e..0d9b54ccb5 100644 --- a/cmdline/CtaAdminTextFormatter.cpp +++ b/cmdline/CtaAdminTextFormatter.cpp @@ -634,7 +634,7 @@ void TextFormatter::print(const RepackLsItem &rels_item) { rels_item.failed_to_retrieve_files(), dataSizeToStr(rels_item.failed_to_retrieve_bytes()), rels_item.failed_to_archive_files(), - dataSizeToStr(rels_item.failed_to_retrieve_bytes()), + dataSizeToStr(rels_item.failed_to_archive_bytes()), rels_item.last_expanded_fseq(), rels_item.status() ); diff --git a/objectstore/RepackRequest.cpp b/objectstore/RepackRequest.cpp index b7bb2a6d9f..3f3714fbd6 100644 --- a/objectstore/RepackRequest.cpp +++ b/objectstore/RepackRequest.cpp @@ -72,6 +72,7 @@ void RepackRequest::initialize() { m_payload.set_archivedbytes(0); m_payload.set_failedtoretrievefiles(0); m_payload.set_failedtoretrievebytes(0); + m_payload.set_failedtocreatearchivereq(0); m_payload.set_failedtoarchivefiles(0); m_payload.set_failedtoarchivebytes(0); m_payload.set_lastexpandedfseq(0); @@ -196,9 +197,9 @@ void RepackRequest::setStatus(){ checkPayloadReadable(); if(m_payload.is_expand_started()){ - //The expansion of the Repack Request have started + //The expansion of the Repack Request have started if(m_payload.is_expand_finished()){ - if( (m_payload.retrievedfiles() + 
m_payload.failedtoretrievefiles() >= m_payload.totalfilestoretrieve()) && (m_payload.archivedfiles() + m_payload.failedtoarchivefiles() >= m_payload.totalfilestoarchive()) ){ + if( (m_payload.retrievedfiles() + m_payload.failedtoretrievefiles() >= m_payload.totalfilestoretrieve()) && (m_payload.archivedfiles() + m_payload.failedtoarchivefiles() + m_payload.failedtocreatearchivereq() >= m_payload.totalfilestoarchive()) ){ //We reached the end if (m_payload.failedtoretrievefiles() || m_payload.failedtoarchivefiles()) { //At least one retrieve or archive has failed @@ -520,13 +521,28 @@ auto RepackRequest::getStats() -> std::map<StatsType, StatsValues> { //------------------------------------------------------------------------------ // RepackRequest::reportRetrieveCreationFailures() //------------------------------------------------------------------------------ -void RepackRequest::reportRetrieveCreationFailures(const StatsValues& failedRetrieveCreation){ +void RepackRequest::reportRetrieveCreationFailures(const std::list<cta::SchedulerDatabase::RepackRequest::Subrequest>& notCreatedSubrequests){ checkPayloadWritable(); - m_payload.set_failedtoretrievebytes(m_payload.failedtoretrievebytes() + failedRetrieveCreation.bytes); - m_payload.set_failedtoretrievefiles(m_payload.failedtoretrievefiles() + failedRetrieveCreation.files); + uint64_t failedToRetrieveFiles = 0, failedToRetrieveBytes = 0, failedToCreateArchiveReq = 0; /* each accumulator gets its own "= 0": in "a, b, c = 0" the initializer belongs to c alone */ + for(auto & subreq: notCreatedSubrequests){ + failedToRetrieveFiles++; + failedToRetrieveBytes+=subreq.archiveFile.fileSize; + for(auto & copyNb: subreq.copyNbsToRearchive){ + (void) copyNb; + failedToCreateArchiveReq++; + } + } + m_payload.set_failedtoretrievebytes(m_payload.failedtoretrievebytes() + failedToRetrieveBytes); + m_payload.set_failedtoretrievefiles(m_payload.failedtoretrievefiles() + failedToRetrieveFiles); + reportArchiveCreationFailures(failedToCreateArchiveReq); setStatus(); } +void RepackRequest::reportArchiveCreationFailures(uint64_t 
nbFailedToCreateArchiveRequests){ + checkPayloadWritable(); + m_payload.set_failedtocreatearchivereq(m_payload.failedtocreatearchivereq() + nbFailedToCreateArchiveRequests); +} + //------------------------------------------------------------------------------ // RepackRequest::garbageCollect() //------------------------------------------------------------------------------ diff --git a/objectstore/RepackRequest.hpp b/objectstore/RepackRequest.hpp index 70dac1f871..bfa7c812d8 100644 --- a/objectstore/RepackRequest.hpp +++ b/objectstore/RepackRequest.hpp @@ -126,7 +126,9 @@ public: }; std::map<StatsType, StatsValues> getStats(); - void reportRetrieveCreationFailures(const StatsValues &failedRetrieveCreated); + void reportRetrieveCreationFailures(const std::list<cta::SchedulerDatabase::RepackRequest::Subrequest>& notCreatedSubrequests); + + void reportArchiveCreationFailures(uint64_t nbFailedToCreateArchiveRequests); void garbageCollect(const std::string &presumedOwner, AgentReference & agentReference, log::LogContext & lc, cta::catalogue::Catalogue & catalogue) override; diff --git a/objectstore/cta.proto b/objectstore/cta.proto index 55a327cac4..734e0fb1b4 100644 --- a/objectstore/cta.proto +++ b/objectstore/cta.proto @@ -568,6 +568,7 @@ message RepackRequest { required uint64 archivedbytes = 11510; required uint64 failedtoretrievefiles = 11520; required uint64 failedtoretrievebytes = 11530; + required uint64 failedtocreatearchivereq = 11535; required uint64 failedtoarchivefiles = 11540; required uint64 failedtoarchivebytes = 11550; required uint64 lastexpandedfseq = 11560; diff --git a/scheduler/OStoreDB/OStoreDB.cpp b/scheduler/OStoreDB/OStoreDB.cpp index 9c30521bea..8c95269d4e 100644 --- a/scheduler/OStoreDB/OStoreDB.cpp +++ b/scheduler/OStoreDB/OStoreDB.cpp @@ -2086,12 +2086,17 @@ void OStoreDB::RepackRetrieveFailureReportBatch::report(log::LogContext& lc){ { // Prepare the report objectstore::RepackRequest::SubrequestStatistics::List ssl; + uint64_t 
failedToCreateArchiveReq = 0; for (auto &rr: m_subrequestList) { ssl.push_back(objectstore::RepackRequest::SubrequestStatistics()); ssl.back().bytes = rr.archiveFile.fileSize; ssl.back().files = 1; ssl.back().fSeq = rr.repackInfo.fSeq; fSeqsToDelete.push_back(rr.repackInfo.fSeq); + for(auto& copyNb: rr.repackInfo.copyNbsToRearchive){ + (void) copyNb; + failedToCreateArchiveReq++; + } } // Record it. timingList.insertAndReset("failureStatsPrepareTime", t); @@ -2101,6 +2106,8 @@ void OStoreDB::RepackRetrieveFailureReportBatch::report(log::LogContext& lc){ timingList.insertAndReset("failureStatsFetchTime", t); m_repackRequest.reportSubRequestsForDeletion(fSeqsToDelete); timingList.insertAndReset("failureStatsReportSubRequestsForDeletionTime", t); + m_repackRequest.reportArchiveCreationFailures(failedToCreateArchiveReq); + timingList.insertAndReset("failureArchiveCreationStatsUpdateTime",t); m_repackRequest.reportRetriveFailures(ssl); timingList.insertAndReset("failureStatsUpdateTime", t); m_repackRequest.commit(); @@ -2198,10 +2205,8 @@ void OStoreDB::RepackRequest::addSubrequestsAndUpdateStats(std::list<Subrequest> for (auto &rn: subrequestsNames) { subReqInfoMap[rn.fSeq] = rn; } // Try to create the retrieve subrequests (owned by this process, to be queued in a second step) // subrequests can already fail at that point if we cannot find a copy on a valid tape. - std::list<uint64_t> failedFSeqs; + std::list<Subrequest> notCreatedSubrequests; objectstore::RepackRequest::StatsValues failedCreationStats; - uint64_t failedFiles = 0; - uint64_t failedBytes = 0; // First loop: we will issue the async insertions of the subrequests. 
struct AsyncInsertionInfo { Subrequest & rsr; @@ -2233,9 +2238,9 @@ void OStoreDB::RepackRequest::addSubrequestsAndUpdateStats(std::list<Subrequest> rRRepackInfo.archiveRouteMap[ar.second.copyNb] = ar.second.tapePoolName; } } catch (std::out_of_range &) { - failedFSeqs.emplace_back(rsr.fSeq); + notCreatedSubrequests.emplace_back(rsr); failedCreationStats.files++; - failedCreationStats.bytes += rsr.archiveFile.fileSize; + failedCreationStats.bytes+=rsr.archiveFile.fileSize; log::ScopedParamContainer params(lc); params.add("fileID", rsr.archiveFile.archiveFileID) .add("diskInstance", rsr.archiveFile.diskInstance) @@ -2286,9 +2291,9 @@ void OStoreDB::RepackRequest::addSubrequestsAndUpdateStats(std::list<Subrequest> bestVid = Helpers::selectBestRetrieveQueue(candidateVids, m_oStoreDB.m_catalogue, m_oStoreDB.m_objectStore); } catch (Helpers::NoTapeAvailableForRetrieve &) { // Count the failure for this subrequest. - failedFSeqs.emplace_back(rsr.fSeq); - failedFiles++; - failedBytes += rsr.archiveFile.fileSize; + notCreatedSubrequests.emplace_back(rsr); + failedCreationStats.files++; + failedCreationStats.bytes += rsr.archiveFile.fileSize; log::ScopedParamContainer params(lc); params.add("fileId", rsr.archiveFile.archiveFileID) .add("repackVid", repackInfo.vid); @@ -2304,7 +2309,7 @@ void OStoreDB::RepackRequest::addSubrequestsAndUpdateStats(std::list<Subrequest> } { // Count the failure for this subrequest. - failedFSeqs.emplace_back(rsr.fSeq); + notCreatedSubrequests.emplace_back(rsr); failedCreationStats.files++; failedCreationStats.bytes += rsr.archiveFile.fileSize; log::ScopedParamContainer params(lc); @@ -2336,10 +2341,9 @@ void OStoreDB::RepackRequest::addSubrequestsAndUpdateStats(std::list<Subrequest> } catch (exception::Exception & ex) { // We can fail to serialize here... // Count the failure for this subrequest. 
- failedFSeqs.emplace_back(rsr.fSeq); + notCreatedSubrequests.emplace_back(rsr); failedCreationStats.files++; failedCreationStats.bytes += rsr.archiveFile.fileSize; - failedFSeqs.emplace_back(rsr.fSeq); log::ScopedParamContainer params(lc); params.add("fileId", rsr.archiveFile.archiveFileID) .add("repackVid", repackInfo.vid) @@ -2374,7 +2378,7 @@ void OStoreDB::RepackRequest::addSubrequestsAndUpdateStats(std::list<Subrequest> asyncInsertedSubrequestInfoList.emplace_back(AsyncInsertedSubrequestInfo{aii.rsr, aii.bestVid, aii.activeCopyNb, aii.request}); } catch (exception::Exception & ex) { // Count the failure for this subrequest. - failedFSeqs.emplace_back(aii.rsr.fSeq); + notCreatedSubrequests.emplace_back(aii.rsr); failedCreationStats.files++; failedCreationStats.bytes += aii.rsr.archiveFile.fileSize; log::ScopedParamContainer params(lc); @@ -2387,11 +2391,11 @@ void OStoreDB::RepackRequest::addSubrequestsAndUpdateStats(std::list<Subrequest> "In OStoreDB::RepackRequest::addSubrequests(): could not asyncInsert the subrequest."); } } - if(failedFSeqs.size()){ + if(notCreatedSubrequests.size()){ log::ScopedParamContainer params(lc); params.add("files", failedCreationStats.files); params.add("bytes", failedCreationStats.bytes); - m_repackRequest.reportRetrieveCreationFailures(failedCreationStats); + m_repackRequest.reportRetrieveCreationFailures(notCreatedSubrequests); m_repackRequest.commit(); lc.log(log::ERR, "In OStoreDB::RepackRequest::addSubRequests(), reported the failed creation of Retrieve Requests to the Repack request"); } -- GitLab