From 63506f23e4ffa89ea3a77f6cf4e460cd2f0a94be Mon Sep 17 00:00:00 2001 From: Cedric CAFFY <cedric.caffy@cern.ch> Date: Thu, 20 Feb 2020 16:22:50 +0100 Subject: [PATCH] Corrected race condition when the disk threads finished after the tape is unmounted. If the disk threads are not finished after the TapeReadSingleThread finished, the status of the drive will be "DrainingToDisk" This removed a RecallReportPacker exception when trying to release disk space reservation for backpressure --- scheduler/OStoreDB/OStoreDB.cpp | 2 +- .../tape/tapeserver/daemon/DiskWriteTaskTest.cpp | 4 ++++ .../tape/tapeserver/daemon/DiskWriteThreadPool.cpp | 2 -- .../tape/tapeserver/daemon/RecallReportPacker.cpp | 12 ++++++++++-- .../tapeserver/daemon/RecallReportPackerTest.cpp | 6 ------ .../tape/tapeserver/daemon/TapeReadSingleThread.cpp | 4 ++-- 6 files changed, 17 insertions(+), 13 deletions(-) diff --git a/scheduler/OStoreDB/OStoreDB.cpp b/scheduler/OStoreDB/OStoreDB.cpp index 78dd4916e1..722c1a238a 100644 --- a/scheduler/OStoreDB/OStoreDB.cpp +++ b/scheduler/OStoreDB/OStoreDB.cpp @@ -2978,6 +2978,7 @@ void OStoreDB::updateDriveStatus(const common::dataStructures::DriveInfo& driveI log::ScopedParamContainer params(lc); params.add("diskSystem", dr.first) .add("bytes", dr.second) + .add("previousStatus",toString(previousStatus)) .add("newStatus", toString(driveState.driveStatus)); lc.log(log::WARNING, "In OStoreDB::updateDriveStatus(): will clear non-empty disk space reservation on status change."); } @@ -3901,7 +3902,6 @@ void OStoreDB::RetrieveMount::reserveDiskSpace(const DiskSpaceReservationRequest .add("objectName", ds.getAddressIfSet()); lc.log(log::DEBUG, "In RetrieveMount::reserveDiskSpace(): state after reservation."); } - //TODO BUG HERE, see mountId=\"10\" in downloads/tpsrv0* ds.commit(); } diff --git a/tapeserver/castor/tape/tapeserver/daemon/DiskWriteTaskTest.cpp b/tapeserver/castor/tape/tapeserver/daemon/DiskWriteTaskTest.cpp index a49fc4bb65..874130eebf 100644 --- a/tapeserver/castor/tape/tapeserver/daemon/DiskWriteTaskTest.cpp +++ b/tapeserver/castor/tape/tapeserver/daemon/DiskWriteTaskTest.cpp @@ -86,6 +86,10 @@ namespace unitTests{ cta::threading::MutexLocker ml(m_mutex); endSessionsWithError++; } + void setDiskDone() override {} + + void setTapeDone() override {} + MockRecallReportPacker(cta::RetrieveMount *rm,cta::log::LogContext lc): RecallReportPacker(rm,lc), completeJobs(0), failedJobs(0), endSessions(0), endSessionsWithError(0) {} diff --git a/tapeserver/castor/tape/tapeserver/daemon/DiskWriteThreadPool.cpp b/tapeserver/castor/tape/tapeserver/daemon/DiskWriteThreadPool.cpp index b343526488..323307f0c9 100644 --- a/tapeserver/castor/tape/tapeserver/daemon/DiskWriteThreadPool.cpp +++ b/tapeserver/castor/tape/tapeserver/daemon/DiskWriteThreadPool.cpp @@ -178,8 +178,6 @@ void DiskWriteThreadPool::DiskWriteWorkerThread::run() { logWithStat(cta::log::INFO, "Finishing DiskWriteWorkerThread"); m_parentThreadPool.addThreadStats(m_threadStat); if(0 == --m_parentThreadPool.m_nbActiveThread){ - m_parentThreadPool.m_reporter.setDiskDone(); - m_parentThreadPool.m_reporter.setTapeDone(); //Im the last Thread alive, report end of session if(m_parentThreadPool.m_failedWriteCount==0){ diff --git a/tapeserver/castor/tape/tapeserver/daemon/RecallReportPacker.cpp b/tapeserver/castor/tape/tapeserver/daemon/RecallReportPacker.cpp index 2e0ddfcded..c59172cfbf 100644 --- a/tapeserver/castor/tape/tapeserver/daemon/RecallReportPacker.cpp +++ b/tapeserver/castor/tape/tapeserver/daemon/RecallReportPacker.cpp @@ -131,6 +131,7 @@ void RecallReportPacker::ReportSuccessful::execute(RecallReportPacker& parent){ //ReportEndofSession::execute //------------------------------------------------------------------------------ void RecallReportPacker::ReportEndofSession::execute(RecallReportPacker& parent){ + parent.setDiskDone(); if(!parent.errorHappened()){ parent.m_lc.log(cta::log::INFO,"Nominal RecallReportPacker::EndofSession has been reported"); if (parent.m_watchdog) { @@ -167,7 +168,6 @@ bool RecallReportPacker::ReportEndofSession::goingToEnd() { void RecallReportPacker::ReportDriveStatus::execute(RecallReportPacker& parent){ parent.m_retrieveMount->setDriveStatus(m_status); if(m_status==cta::common::dataStructures::DriveStatus::Unmounting) { - parent.m_retrieveMount->diskComplete(); parent.m_retrieveMount->tapeComplete(); } } @@ -183,6 +183,7 @@ bool RecallReportPacker::ReportDriveStatus::goingToEnd() { //ReportEndofSessionWithErrors::execute //------------------------------------------------------------------------------ void RecallReportPacker::ReportEndofSessionWithErrors::execute(RecallReportPacker& parent){ + parent.setDiskDone(); if(parent.m_errorHappened) { LogContext::ScopedParam(parent.m_lc,Param("errorCode",m_error_code)); parent.m_lc.log(cta::log::ERR,m_message); @@ -320,7 +321,13 @@ void RecallReportPacker::WorkerThread::run(){ // Make sure the last batch of reports got cleaned up. try { - m_parent.fullCheckAndFinishAsyncExecute(); + m_parent.fullCheckAndFinishAsyncExecute(); + if(m_parent.m_diskThreadComplete){ + //The m_parent.m_diskThreadComplete is set to true when a ReportEndOfSession or a ReportAndOfSessionWithError + //has been put. It is only after the fullCheckandFinishAsyncExecute is finished that we can say to the mount that the disk thread is complete. + m_parent.m_lc.log(cta::log::DEBUG, "In RecallReportPacker::WorkerThread::run(): all disk threads are finished, telling the mount that Disk threads are complete"); + m_parent.m_retrieveMount->diskComplete(); + } } catch(const cta::exception::Exception& e){ cta::log::ScopedParamContainer params(m_parent.m_lc); params.add("exceptionWhat", e.getMessageValue()) @@ -389,6 +396,7 @@ void RecallReportPacker::fullCheckAndFinishAsyncExecute() { //------------------------------------------------------------------------------ void RecallReportPacker::setTapeDone() { m_tapeThreadComplete = true; + m_retrieveMount->tapeComplete(); } //------------------------------------------------------------------------------ diff --git a/tapeserver/castor/tape/tapeserver/daemon/RecallReportPackerTest.cpp b/tapeserver/castor/tape/tapeserver/daemon/RecallReportPackerTest.cpp index 683a813e19..2ac1a4416d 100644 --- a/tapeserver/castor/tape/tapeserver/daemon/RecallReportPackerTest.cpp +++ b/tapeserver/castor/tape/tapeserver/daemon/RecallReportPackerTest.cpp @@ -95,9 +95,6 @@ TEST_F(castor_tape_tapeserver_daemon_RecallReportPackerTest, RecallReportPackerN rrp.reportCompletedJob(std::move(job1)); rrp.reportCompletedJob(std::move(job2)); - rrp.setDiskDone(); - rrp.setTapeDone(); - rrp.reportDriveStatus(cta::common::dataStructures::DriveStatus::Unmounting); rrp.reportEndOfSession(); @@ -151,9 +148,6 @@ TEST_F(castor_tape_tapeserver_daemon_RecallReportPackerTest, RecallReportPackerB const cta::exception::Exception ex(error_msg); rrp.reportFailedJob(std::move(job3), ex); - rrp.setDiskDone(); - rrp.setTapeDone(); - rrp.reportDriveStatus(cta::common::dataStructures::DriveStatus::Unmounting); rrp.reportEndOfSession(); diff --git a/tapeserver/castor/tape/tapeserver/daemon/TapeReadSingleThread.cpp b/tapeserver/castor/tape/tapeserver/daemon/TapeReadSingleThread.cpp index 7ca950f561..2fd34e9750 100644 --- a/tapeserver/castor/tape/tapeserver/daemon/TapeReadSingleThread.cpp +++ b/tapeserver/castor/tape/tapeserver/daemon/TapeReadSingleThread.cpp @@ -111,11 +111,10 @@ castor::tape::tapeserver::daemon::TapeReadSingleThread::TapeCleaning::~TapeClean // And return the tape to the library // In case of manual mode, this will be filtered by the rmc daemon // (which will do nothing) - currentErrorToCount = "Error_tapeDismount"; + currentErrorToCount = "Error_tapeDismount"; m_this.m_rrp.reportDriveStatus(cta::common::dataStructures::DriveStatus::Unmounting); m_this.m_mc.dismountTape(m_this.m_volInfo.vid, m_this.m_drive.config.librarySlot()); m_this.m_drive.disableLogicalBlockProtection(); - m_this.m_rrp.reportDriveStatus(cta::common::dataStructures::DriveStatus::Up); m_this.m_stats.unmountTime += m_timer.secs(cta::utils::Timer::resetCounter); m_this.m_logContext.log(cta::log::INFO, cta::mediachanger::TAPE_LIBRARY_TYPE_MANUAL != m_this.m_drive.config.librarySlot().getLibraryType() ? "TapeReadSingleThread : tape unmounted":"TapeReadSingleThread : tape NOT unmounted (manual mode)"); @@ -356,6 +355,7 @@ void castor::tape::tapeserver::daemon::TapeReadSingleThread::run() { cta::log::ScopedParamContainer params(m_logContext); params.add("status", "success"); m_stats.totalTime = totalTimer.secs(); + m_rrp.setTapeDone(); logWithStat(cta::log::INFO, "Tape thread complete", params); // Report one last time the stats, after unloading/unmounting. -- GitLab