Commit 19737ac9 authored by Eric Cano's avatar Eric Cano
Browse files

Added timeout errors to the finished session summary.

Added error for killed sessions.
Added delay to protect against races between end of process and status update.
Added coverage for the unlikely event where a session finishes on its own while we intend to kill it for shutdown of timeout.
parent 52ccfa32
......@@ -564,6 +564,7 @@ void castor::tape::tapeserver::daemon::CatalogueDrive::
session->sessionSucceeded();
m_vdqm.setDriveDown(m_hostName, m_config.unitName, m_config.dgn);
break;
case DRIVE_STATE_WAITSHUTDOWNKILL:
case DRIVE_STATE_WAITSHUTDOWNCLEANER:
changeState(DRIVE_STATE_SHUTDOWN);
session->sessionSucceeded();
......@@ -588,7 +589,33 @@ void castor::tape::tapeserver::daemon::CatalogueDrive::sessionFailed() {
switch(m_state) {
case DRIVE_STATE_RUNNING:
case DRIVE_STATE_WAITDOWN:
case DRIVE_STATE_WAITSHUTDOWNKILL:
return runningSessionFailed();
case DRIVE_STATE_WAITSHUTDOWNCLEANER:
return cleanerOfShutdownFailed();
default:
{
castor::exception::Exception ex;
ex.getMessage() <<
"Failed to record tape session failed for session with pid " <<
getSession().getPid() << ": Incompatible drive state: state=" <<
driveStateToStr(m_state);
delete m_session;
m_session = NULL;
throw ex;
}
}
}
//-----------------------------------------------------------------------------
// sessionKilled
//-----------------------------------------------------------------------------
void castor::tape::tapeserver::daemon::CatalogueDrive::
sessionKilled(uint32_t signal) {
switch(m_state) {
case DRIVE_STATE_RUNNING:
case DRIVE_STATE_WAITDOWN:
return runningSessionKilled(signal);
case DRIVE_STATE_WAITSHUTDOWNKILL:
return sessionKilledByShutdown();
case DRIVE_STATE_WAITSHUTDOWNCLEANER:
......@@ -614,6 +641,18 @@ void castor::tape::tapeserver::daemon::CatalogueDrive::runningSessionFailed() {
std::auto_ptr<CatalogueSession> session(m_session);
m_session = NULL;
session->sessionFailed();
changeState(DRIVE_STATE_DOWN);
m_vdqm.setDriveDown(m_hostName, m_config.unitName, m_config.dgn);
}
//------------------------------------------------------------------------------
// runningSessionKilled
//------------------------------------------------------------------------------
void castor::tape::tapeserver::daemon::CatalogueDrive::
runningSessionKilled(uint32_t signal) {
std::auto_ptr<CatalogueSession> session(m_session);
m_session = NULL;
session->sessionKilled(signal);
if(CatalogueSession::SESSION_TYPE_CLEANER != session->getType()) {
const uint32_t driveReadyDelayInSeconds = 60;
......@@ -625,6 +664,7 @@ void castor::tape::tapeserver::daemon::CatalogueDrive::runningSessionFailed() {
}
}
//------------------------------------------------------------------------------
// sessionKilledByShutdown
//------------------------------------------------------------------------------
......
......@@ -331,9 +331,16 @@ public:
void sessionSucceeded();
/**
* Informs catalogue drive that the current tape session has failed.
* Informs catalogue drive that the current tape session completed in a failed
* state (tape not unloaded, usually) and the drive should be marked as down.
*/
void sessionFailed();
/**
* Informs catalogue drive that the current tape session was killed (either
* due to a hang, of for other reason, like a dissatisfied operator.)
*/
void sessionKilled(uint32_t signal);
/**
* Informs catalogue drive that the current tape session has failed and that
......@@ -636,9 +643,15 @@ private:
/**
* Called when a running session (DRIVE_STATE_RUNNING or DRIVE_STATE_WAITDOWN)
* has failed.
* has competed with an error return value.
*/
void runningSessionFailed();
/**
* Called when a running session (DRIVE_STATE_RUNNING or DRIVE_STATE_WAITDOWN)
* has been killed.
*/
void runningSessionKilled(uint32_t signal);
/**
* Called when a running session has been intentionally killed by the shutdown
......
......@@ -29,6 +29,7 @@
#include <string>
#include <sys/types.h>
#include <unistd.h>
#include <stdint.h>
namespace castor {
namespace tape {
......@@ -91,6 +92,13 @@ public:
* To be called when the session has ended with failure.
*/
virtual void sessionFailed() = 0;
/**
* To be called when a session is killed.
* This member function is a no-op by default. (overridden by
* CatalogueTransferSession at the time of writing)
*/
virtual void sessionKilled(uint32_t signal) {}
/**
* Gets the volume identifier of the tape associated with the tape drive.
......
......@@ -141,6 +141,8 @@ bool castor::tape::tapeserver::daemon::CatalogueTransferSession::
m_log(LOG_ERR,
"Killing data-transfer session because transfer job is too late",
params);
// We will also attach the error to final log for the session
addLogParam(castor::log::Param("Error_timeoutGettingJobInfo","1"));
try {
idempotentKill(m_pid, SIGKILL);
......@@ -191,7 +193,9 @@ bool castor::tape::tapeserver::daemon::CatalogueTransferSession::
m_log(LOG_ERR,
"Killing data-transfer session because tape mount is taking too long",
params);
// We will also attach the error to final log for the session
addLogParam(castor::log::Param("Error_timeoutMountingTape","1"));
try {
idempotentKill(m_pid, SIGKILL);
m_state = WAIT_TIMEOUT_KILL;
......@@ -222,6 +226,8 @@ bool castor::tape::tapeserver::daemon::CatalogueTransferSession::
m_log(LOG_ERR,
"Killing data-transfer session because data blocks are not being moved",
params);
// We will also attach the error to final log for the session
addLogParam(castor::log::Param("Error_sessionHung","1"));
try {
idempotentKill(m_pid, SIGKILL);
......@@ -263,6 +269,18 @@ void castor::tape::tapeserver::daemon::CatalogueTransferSession::
m_sessionLogContext.log(LOG_INFO, "Tape session finished");
}
//------------------------------------------------------------------------------
// sessionFailed
//------------------------------------------------------------------------------
void castor::tape::tapeserver::daemon::CatalogueTransferSession::
sessionKilled(uint32_t signal) {
// In case of problem, we mark the session failed ourselves
m_sessionLogContext.pushOrReplace(log::Param("Error_sessionKilled","1"));
m_sessionLogContext.pushOrReplace(log::Param("killSignal",signal));
m_sessionLogContext.pushOrReplace(log::Param("status","failure"));
m_sessionLogContext.log(LOG_INFO, "Tape session finished");
}
//------------------------------------------------------------------------------
// getAssignmentTime
//------------------------------------------------------------------------------
......
......@@ -101,6 +101,11 @@ public:
* To be called when the session has ended with failure.
*/
void sessionFailed();
/**
* To be called when the session has ended with a kill.
*/
virtual void sessionKilled(uint32_t signal);
/**
* Gets the time at which the tape drive was assigned a data transfer job.
......
......@@ -249,6 +249,10 @@ void MigrationReportPacker::ReportEndofSession::execute(MigrationReportPacker& r
reportPacker.m_lc.log(LOG_INFO,"Reported end of session to client");
if(reportPacker.m_watchdog) {
reportPacker.m_watchdog->addParameter(log::Param("status","success"));
// We have a race condition here between the processing of this message by
// the initial process and the printing of the end-of-session log, triggered
// by the end our process. To delay the latter, we sleep half a second here.
usleep(500*1000);
}
}
else {
......@@ -264,6 +268,10 @@ void MigrationReportPacker::ReportEndofSession::execute(MigrationReportPacker& r
reportPacker.m_lc.log(LOG_ERR,"Reported end of session with error to client due to previous file errors");
if(reportPacker.m_watchdog) {
reportPacker.m_watchdog->addParameter(log::Param("status","failure"));
// We have a race condition here between the processing of this message by
// the initial process and the printing of the end-of-session log, triggered
// by the end our process. To delay the latter, we sleep half a second here.
usleep(500*1000);
}
}
reportPacker.m_continue=false;
......@@ -296,6 +304,10 @@ void MigrationReportPacker::ReportEndofSessionWithErrors::execute(MigrationRepor
}
if(reportPacker.m_watchdog) {
reportPacker.m_watchdog->addParameter(log::Param("status","failure"));
// We have a race condition here between the processing of this message by
// the initial process and the printing of the end-of-session log, triggered
// by the end our process. To delay the latter, we sleep half a second here.
usleep(500*1000);
}
reportPacker.m_continue=false;
}
......
......@@ -203,7 +203,7 @@ void castor::tape::tapeserver::daemon::ProcessForkerConnectionHandler::
CatalogueDrive &drive = m_driveCatalogue.findDrive(msg.pid());
// Notify the catalogue drive of the failed session
drive.sessionFailed();
drive.sessionKilled(msg.signal());
} catch(castor::exception::Exception &ne) {
castor::exception::Exception ex;
......
......@@ -155,6 +155,10 @@ void RecallReportPacker::ReportEndofSession::execute(RecallReportPacker& parent)
parent.logRequestReport(chrono,"Nominal RecallReportPacker::EndofSession has been reported",LOG_INFO);
if (parent.m_watchdog) {
parent.m_watchdog->addParameter(log::Param("status","success"));
// We have a race condition here between the processing of this message by
// the initial process and the printing of the end-of-session log, triggered
// by the end our process. To delay the latter, we sleep half a second here.
usleep(500*1000);
}
}
else {
......@@ -164,6 +168,10 @@ void RecallReportPacker::ReportEndofSession::execute(RecallReportPacker& parent)
parent.logRequestReport(chrono,"reporting EndOfSessionWithError done",LOG_ERR);
if (parent.m_watchdog) {
parent.m_watchdog->addParameter(log::Param("status","failure"));
// We have a race condition here between the processing of this message by
// the initial process and the printing of the end-of-session log, triggered
// by the end our process. To delay the latter, we sleep half a second here.
usleep(500*1000);
}
}
}
......@@ -184,6 +192,10 @@ void RecallReportPacker::ReportEndofSessionWithErrors::execute(RecallReportPacke
}
if (parent.m_watchdog) {
parent.m_watchdog->addParameter(log::Param("status","failure"));
// We have a race condition here between the processing of this message by
// the initial process and the printing of the end-of-session log, triggered
// by the end our process. To delay the latter, we sleep half a second here.
usleep(500*1000);
}
}
//------------------------------------------------------------------------------
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment