diff --git a/castor/tape/tapeserver/daemon/CatalogueDrive.cpp b/castor/tape/tapeserver/daemon/CatalogueDrive.cpp index 36d2ec1d6eb5f842c1fd17549560aeb0c7754b85..40bd38b76507e7c98e84131664ecb5dd3d70cc39 100644 --- a/castor/tape/tapeserver/daemon/CatalogueDrive.cpp +++ b/castor/tape/tapeserver/daemon/CatalogueDrive.cpp @@ -564,6 +564,7 @@ void castor::tape::tapeserver::daemon::CatalogueDrive:: session->sessionSucceeded(); m_vdqm.setDriveDown(m_hostName, m_config.unitName, m_config.dgn); break; + case DRIVE_STATE_WAITSHUTDOWNKILL: case DRIVE_STATE_WAITSHUTDOWNCLEANER: changeState(DRIVE_STATE_SHUTDOWN); session->sessionSucceeded(); @@ -588,7 +589,33 @@ void castor::tape::tapeserver::daemon::CatalogueDrive::sessionFailed() { switch(m_state) { case DRIVE_STATE_RUNNING: case DRIVE_STATE_WAITDOWN: + case DRIVE_STATE_WAITSHUTDOWNKILL: return runningSessionFailed(); + case DRIVE_STATE_WAITSHUTDOWNCLEANER: + return cleanerOfShutdownFailed(); + default: + { + castor::exception::Exception ex; + ex.getMessage() << + "Failed to record tape session failed for session with pid " << + getSession().getPid() << ": Incompatible drive state: state=" << + driveStateToStr(m_state); + delete m_session; + m_session = NULL; + throw ex; + } + } +} + +//----------------------------------------------------------------------------- +// sessionKilled +//----------------------------------------------------------------------------- +void castor::tape::tapeserver::daemon::CatalogueDrive:: +sessionKilled(uint32_t signal) { + switch(m_state) { + case DRIVE_STATE_RUNNING: + case DRIVE_STATE_WAITDOWN: + return runningSessionKilled(signal); case DRIVE_STATE_WAITSHUTDOWNKILL: return sessionKilledByShutdown(); case DRIVE_STATE_WAITSHUTDOWNCLEANER: @@ -614,6 +641,18 @@ void castor::tape::tapeserver::daemon::CatalogueDrive::runningSessionFailed() { std::auto_ptr<CatalogueSession> session(m_session); m_session = NULL; session->sessionFailed(); + changeState(DRIVE_STATE_DOWN); + m_vdqm.setDriveDown(m_hostName, m_config.unitName, m_config.dgn); +} + +//------------------------------------------------------------------------------ +// runningSessionKilled +//------------------------------------------------------------------------------ +void castor::tape::tapeserver::daemon::CatalogueDrive:: +runningSessionKilled(uint32_t signal) { + std::auto_ptr<CatalogueSession> session(m_session); + m_session = NULL; + session->sessionKilled(signal); if(CatalogueSession::SESSION_TYPE_CLEANER != session->getType()) { const uint32_t driveReadyDelayInSeconds = 60; @@ -625,6 +664,7 @@ void castor::tape::tapeserver::daemon::CatalogueDrive::runningSessionFailed() { } } + //------------------------------------------------------------------------------ // sessionKilledByShutdown //------------------------------------------------------------------------------ diff --git a/castor/tape/tapeserver/daemon/CatalogueDrive.hpp b/castor/tape/tapeserver/daemon/CatalogueDrive.hpp index 62aba18b9fcc8c7d995486813f9344ef2f2fcbcc..6e845ce3c268da40f0d40fa17ca27c585a103405 100644 --- a/castor/tape/tapeserver/daemon/CatalogueDrive.hpp +++ b/castor/tape/tapeserver/daemon/CatalogueDrive.hpp @@ -331,9 +331,16 @@ public: void sessionSucceeded(); /** - * Informs catalogue drive that the current tape session has failed. + * Informs catalogue drive that the current tape session completed in a failed + * state (tape not unloaded, usually) and the drive should be marked as down. */ void sessionFailed(); + + /** + * Informs catalogue drive that the current tape session was killed (either + * due to a hang, of for other reason, like a dissatisfied operator.) + */ + void sessionKilled(uint32_t signal); /** * Informs catalogue drive that the current tape session has failed and that @@ -636,9 +643,15 @@ private: /** * Called when a running session (DRIVE_STATE_RUNNING or DRIVE_STATE_WAITDOWN) - * has failed. + * has competed with an error return value. */ void runningSessionFailed(); + + /** + * Called when a running session (DRIVE_STATE_RUNNING or DRIVE_STATE_WAITDOWN) + * has been killed. + */ + void runningSessionKilled(uint32_t signal); /** * Called when a running session has been intentionally killed by the shutdown diff --git a/castor/tape/tapeserver/daemon/CatalogueSession.hpp b/castor/tape/tapeserver/daemon/CatalogueSession.hpp index f0814314b3dad43cd09f851541b12c704d8b9f6e..12e268b0049842757fb8412777bcc64f3874788b 100644 --- a/castor/tape/tapeserver/daemon/CatalogueSession.hpp +++ b/castor/tape/tapeserver/daemon/CatalogueSession.hpp @@ -29,6 +29,7 @@ #include <string> #include <sys/types.h> #include <unistd.h> +#include <stdint.h> namespace castor { namespace tape { @@ -91,6 +92,13 @@ public: * To be called when the session has ended with failure. */ virtual void sessionFailed() = 0; + + /** + * To be called when a session is killed. + * This member function is a no-op by default. (overridden by + * CatalogueTransferSession at the time of writing) + */ + virtual void sessionKilled(uint32_t signal) {} /** * Gets the volume identifier of the tape associated with the tape drive. diff --git a/castor/tape/tapeserver/daemon/CatalogueTransferSession.cpp b/castor/tape/tapeserver/daemon/CatalogueTransferSession.cpp index cc40d1d49f28e1440e78a95a7d74e00d76db1998..05ef4ee0330e481aea52bea2b6ed6561fb4b9d0f 100644 --- a/castor/tape/tapeserver/daemon/CatalogueTransferSession.cpp +++ b/castor/tape/tapeserver/daemon/CatalogueTransferSession.cpp @@ -141,6 +141,8 @@ bool castor::tape::tapeserver::daemon::CatalogueTransferSession:: m_log(LOG_ERR, "Killing data-transfer session because transfer job is too late", params); + // We will also attach the error to final log for the session + addLogParam(castor::log::Param("Error_timeoutGettingJobInfo","1")); try { idempotentKill(m_pid, SIGKILL); @@ -191,7 +193,9 @@ bool castor::tape::tapeserver::daemon::CatalogueTransferSession:: m_log(LOG_ERR, "Killing data-transfer session because tape mount is taking too long", params); - + // We will also attach the error to final log for the session + addLogParam(castor::log::Param("Error_timeoutMountingTape","1")); + try { idempotentKill(m_pid, SIGKILL); m_state = WAIT_TIMEOUT_KILL; @@ -222,6 +226,8 @@ bool castor::tape::tapeserver::daemon::CatalogueTransferSession:: m_log(LOG_ERR, "Killing data-transfer session because data blocks are not being moved", params); + // We will also attach the error to final log for the session + addLogParam(castor::log::Param("Error_sessionHung","1")); try { idempotentKill(m_pid, SIGKILL); @@ -263,6 +269,18 @@ void castor::tape::tapeserver::daemon::CatalogueTransferSession:: m_sessionLogContext.log(LOG_INFO, "Tape session finished"); } +//------------------------------------------------------------------------------ +// sessionFailed +//------------------------------------------------------------------------------ +void castor::tape::tapeserver::daemon::CatalogueTransferSession:: + sessionKilled(uint32_t signal) { + // In case of problem, we mark the session failed ourselves + m_sessionLogContext.pushOrReplace(log::Param("Error_sessionKilled","1")); + m_sessionLogContext.pushOrReplace(log::Param("killSignal",signal)); + m_sessionLogContext.pushOrReplace(log::Param("status","failure")); + m_sessionLogContext.log(LOG_INFO, "Tape session finished"); +} + //------------------------------------------------------------------------------ // getAssignmentTime //------------------------------------------------------------------------------ diff --git a/castor/tape/tapeserver/daemon/CatalogueTransferSession.hpp b/castor/tape/tapeserver/daemon/CatalogueTransferSession.hpp index f87f36142a0209ea02a100e4dceb1e5b48330208..b437e194cfa3bbb04f6fca06468254d6bace61b7 100644 --- a/castor/tape/tapeserver/daemon/CatalogueTransferSession.hpp +++ b/castor/tape/tapeserver/daemon/CatalogueTransferSession.hpp @@ -101,6 +101,11 @@ public: * To be called when the session has ended with failure. */ void sessionFailed(); + + /** + * To be called when the session has ended with a kill. + */ + virtual void sessionKilled(uint32_t signal); /** * Gets the time at which the tape drive was assigned a data transfer job. diff --git a/castor/tape/tapeserver/daemon/MigrationReportPacker.cpp b/castor/tape/tapeserver/daemon/MigrationReportPacker.cpp index 4f6cc33cfe7f97b93dbdfb8608f3b57580a06779..eb393e364e1fff544847cbbce4883386dab3a0d6 100644 --- a/castor/tape/tapeserver/daemon/MigrationReportPacker.cpp +++ b/castor/tape/tapeserver/daemon/MigrationReportPacker.cpp @@ -249,6 +249,10 @@ void MigrationReportPacker::ReportEndofSession::execute(MigrationReportPacker& r reportPacker.m_lc.log(LOG_INFO,"Reported end of session to client"); if(reportPacker.m_watchdog) { reportPacker.m_watchdog->addParameter(log::Param("status","success")); + // We have a race condition here between the processing of this message by + // the initial process and the printing of the end-of-session log, triggered + // by the end our process. To delay the latter, we sleep half a second here. + usleep(500*1000); } } else { @@ -264,6 +268,10 @@ void MigrationReportPacker::ReportEndofSession::execute(MigrationReportPacker& r reportPacker.m_lc.log(LOG_ERR,"Reported end of session with error to client due to previous file errors"); if(reportPacker.m_watchdog) { reportPacker.m_watchdog->addParameter(log::Param("status","failure")); + // We have a race condition here between the processing of this message by + // the initial process and the printing of the end-of-session log, triggered + // by the end our process. To delay the latter, we sleep half a second here. + usleep(500*1000); } } reportPacker.m_continue=false; @@ -296,6 +304,10 @@ void MigrationReportPacker::ReportEndofSessionWithErrors::execute(MigrationRepor } if(reportPacker.m_watchdog) { reportPacker.m_watchdog->addParameter(log::Param("status","failure")); + // We have a race condition here between the processing of this message by + // the initial process and the printing of the end-of-session log, triggered + // by the end our process. To delay the latter, we sleep half a second here. + usleep(500*1000); } reportPacker.m_continue=false; } diff --git a/castor/tape/tapeserver/daemon/ProcessForkerConnectionHandler.cpp b/castor/tape/tapeserver/daemon/ProcessForkerConnectionHandler.cpp index 87993b842982d8ba006271452ae72bb0f41b7461..b45571d3a57c8b4355beaa2c20bcc8629eda86fc 100644 --- a/castor/tape/tapeserver/daemon/ProcessForkerConnectionHandler.cpp +++ b/castor/tape/tapeserver/daemon/ProcessForkerConnectionHandler.cpp @@ -203,7 +203,7 @@ void castor::tape::tapeserver::daemon::ProcessForkerConnectionHandler:: CatalogueDrive &drive = m_driveCatalogue.findDrive(msg.pid()); // Notify the catalogue drive of the failed session - drive.sessionFailed(); + drive.sessionKilled(msg.signal()); } catch(castor::exception::Exception &ne) { castor::exception::Exception ex; diff --git a/castor/tape/tapeserver/daemon/RecallReportPacker.cpp b/castor/tape/tapeserver/daemon/RecallReportPacker.cpp index 400419959de2e052d60c83908268fe82e30bd578..2fba5499866dfbe8313d44c5388b29301eaad8d6 100644 --- a/castor/tape/tapeserver/daemon/RecallReportPacker.cpp +++ b/castor/tape/tapeserver/daemon/RecallReportPacker.cpp @@ -155,6 +155,10 @@ void RecallReportPacker::ReportEndofSession::execute(RecallReportPacker& parent) parent.logRequestReport(chrono,"Nominal RecallReportPacker::EndofSession has been reported",LOG_INFO); if (parent.m_watchdog) { parent.m_watchdog->addParameter(log::Param("status","success")); + // We have a race condition here between the processing of this message by + // the initial process and the printing of the end-of-session log, triggered + // by the end our process. To delay the latter, we sleep half a second here. + usleep(500*1000); } } else { @@ -164,6 +168,10 @@ void RecallReportPacker::ReportEndofSession::execute(RecallReportPacker& parent) parent.logRequestReport(chrono,"reporting EndOfSessionWithError done",LOG_ERR); if (parent.m_watchdog) { parent.m_watchdog->addParameter(log::Param("status","failure")); + // We have a race condition here between the processing of this message by + // the initial process and the printing of the end-of-session log, triggered + // by the end our process. To delay the latter, we sleep half a second here. + usleep(500*1000); } } } @@ -184,6 +192,10 @@ void RecallReportPacker::ReportEndofSessionWithErrors::execute(RecallReportPacke } if (parent.m_watchdog) { parent.m_watchdog->addParameter(log::Param("status","failure")); + // We have a race condition here between the processing of this message by + // the initial process and the printing of the end-of-session log, triggered + // by the end our process. To delay the latter, we sleep half a second here. + usleep(500*1000); } } //------------------------------------------------------------------------------