Commit ebe48d83 authored by Cedric Caffy
Browse files

cta-taped sets the reason why a drive has been put down

parent a29df67b
......@@ -65,7 +65,7 @@ namespace cta {
// Stub override: always returns false.
bool bothSidesComplete() override { return false; }
// No-op overrides: the status, the optional reason and the session stats are all ignored.
// NOTE(review): the two setDriveStatus lines are presumably the before/after versions of the
// same member in the commit diff (without and with the reason parameter) - confirm.
void setDriveStatus(cta::common::dataStructures::DriveStatus status) override {};
void setDriveStatus(cta::common::dataStructures::DriveStatus status, const cta::optional<std::string> & reason) override {};
void setTapeSessionStats(const castor::tape::tapeserver::daemon::TapeSessionStats &stats) override {};
......
......@@ -146,8 +146,11 @@ schedule:
cta::common::dataStructures::DesiredDriveState driveState;
driveState.up = false;
driveState.forceDown = false;
std::string errorMsg = "A tape was detected in the drive. Putting the drive back down.";
int logLevel = cta::log::ERR;
driveState.setReasonFromLogMsg(logLevel,errorMsg);
m_scheduler.setDesiredDriveState(securityIdentity, m_driveConfig.unitName, driveState, lc);
lc.log(cta::log::ERR, "A tape was detected in the drive. Putting the drive back down.");
lc.log(logLevel, errorMsg);
goto schedule;
} else {
lc.log(cta::log::INFO, "No tape detected in the drive. Proceeding with scheduling.");
......@@ -162,14 +165,17 @@ schedule:
tapeMount.reset(m_scheduler.getNextMount(m_driveConfig.logicalLibrary, m_driveConfig.unitName, lc).release());
} catch (cta::exception::Exception & e) {
cta::log::ScopedParamContainer localParams(lc);
localParams.add("errorMessage", e.getMessageValue());
lc.log(cta::log::ERR, "Error while scheduling new mount. Putting the drive down. Stack trace follows.");
lc.logBacktrace(cta::log::ERR, e.backtrace());
std::string exceptionMsg = e.getMessageValue();
int logLevel = cta::log::ERR;
localParams.add("errorMessage", exceptionMsg);
lc.log(logLevel, "Error while scheduling new mount. Putting the drive down. Stack trace follows.");
lc.logBacktrace(logLevel, e.backtrace());
m_scheduler.reportDriveStatus(m_driveInfo, cta::common::dataStructures::MountType::NoMount, cta::common::dataStructures::DriveStatus::Down, lc);
cta::common::dataStructures::SecurityIdentity cliId;
cta::common::dataStructures::DesiredDriveState driveState;
driveState.up = false;
driveState.forceDown = false;
driveState.setReasonFromLogMsg(cta::log::ERR,exceptionMsg);
m_scheduler.setDesiredDriveState(cliId, m_driveConfig.unitName, driveState, lc);
return MARK_DRIVE_AS_DOWN;
}
......
......@@ -46,7 +46,7 @@ namespace unitTests{
cta::disk::DiskSystemFreeSpaceList & diskSystemFreeSpace, cta::log::LogContext& logContext) override { throw std::runtime_error("Not implemented");}
// Unimplemented stubs: every override below throws std::runtime_error("Not implemented") when called.
void complete(time_t completionTime) override { throw std::runtime_error("Not implemented"); }
// NOTE(review): the two setDriveStatus lines are presumably the before/after versions from the
// commit diff; the second one adds the optional reason parameter - confirm.
void setDriveStatus(cta::common::dataStructures::DriveStatus status, time_t completionTime) override { throw std::runtime_error("Not implemented"); }
void setDriveStatus(cta::common::dataStructures::DriveStatus status, time_t completionTime, const cta::optional<std::string> & reason) override { throw std::runtime_error("Not implemented"); }
void setTapeSessionStats(const castor::tape::tapeserver::daemon::TapeSessionStats &stats) override { throw std::runtime_error("Not implemented"); }
void flushAsyncSuccessReports(std::list<cta::SchedulerDatabase::RetrieveJob*>& jobsBatch, cta::log::LogContext& lc) override { throw std::runtime_error("Not implemented"); }
};
......
......@@ -40,7 +40,7 @@ namespace unitTests{
// Unimplemented stubs: every override below throws std::runtime_error("Not implemented") when called.
std::list<std::unique_ptr<cta::SchedulerDatabase::RetrieveJob> > getNextJobBatch(uint64_t filesRequested, uint64_t bytesRequested, cta::disk::DiskSystemFreeSpaceList & diskSystemFreeSpace, cta::log::LogContext& logContext) override { throw std::runtime_error("Not implemented");}
void complete(time_t completionTime) override { throw std::runtime_error("Not implemented"); }
// NOTE(review): the two setDriveStatus lines are presumably the before/after versions from the
// commit diff; the second one adds the optional reason parameter - confirm.
void setDriveStatus(cta::common::dataStructures::DriveStatus status, time_t completionTime) override { throw std::runtime_error("Not implemented"); }
void setDriveStatus(cta::common::dataStructures::DriveStatus status, time_t completionTime, const cta::optional<std::string> & reason) override { throw std::runtime_error("Not implemented"); }
void setTapeSessionStats(const castor::tape::tapeserver::daemon::TapeSessionStats &stats) override { throw std::runtime_error("Not implemented"); }
void flushAsyncSuccessReports(std::list<cta::SchedulerDatabase::RetrieveJob*>& jobsBatch, cta::log::LogContext& lc) override { throw std::runtime_error("Not implemented"); }
};
......
......@@ -220,13 +220,13 @@ void MigrationReportPacker::ReportSkipped::execute(MigrationReportPacker& report
//------------------------------------------------------------------------------
//reportDriveStatus
//------------------------------------------------------------------------------
// Queues a drive status report: adds "type" and "Status" parameters to the log context, logs a
// DEBUG message, then creates a MutexLocker on m_producterProtection and pushes a newly
// allocated ReportDriveStatus onto m_fifo.
// NOTE(review): two signature lines and two push calls appear back to back; presumably they are
// the before/after versions from the commit diff (without and with the optional reason) - confirm.
void MigrationReportPacker::reportDriveStatus(cta::common::dataStructures::DriveStatus status, cta::log::LogContext & lc) {
void MigrationReportPacker::reportDriveStatus(cta::common::dataStructures::DriveStatus status,const cta::optional<std::string> & reason, cta::log::LogContext & lc) {
cta::log::ScopedParamContainer params(lc);
params.add("type", "ReportDriveStatus")
.add("Status", cta::common::dataStructures::toString(status));
lc.log(cta::log::DEBUG, "In MigrationReportPacker::reportDriveStatus(), pushing a report.");
cta::threading::MutexLocker ml(m_producterProtection);
m_fifo.push(new ReportDriveStatus(status));
m_fifo.push(new ReportDriveStatus(status,reason));
}
//------------------------------------------------------------------------------
......@@ -236,7 +236,7 @@ void MigrationReportPacker::ReportDriveStatus::execute(MigrationReportPacker& pa
cta::log::ScopedParamContainer params(parent.m_lc);
params.add("status", cta::common::dataStructures::toString(m_status));
parent.m_lc.log(cta::log::DEBUG, "In MigrationReportPacker::ReportDriveStatus::execute(): reporting drive status.");
parent.m_archiveMount->setDriveStatus(m_status);
parent.m_archiveMount->setDriveStatus(m_status,m_reason);
}
//------------------------------------------------------------------------------
......
......@@ -97,7 +97,7 @@ public:
* @param state the new drive state.
* @param lc log context provided by the calling thread.
*/
virtual void reportDriveStatus(cta::common::dataStructures::DriveStatus status, cta::log::LogContext & lc);
virtual void reportDriveStatus(cta::common::dataStructures::DriveStatus status, const cta::optional<std::string> & reason, cta::log::LogContext & lc);
/**
* Create into the MigrationReportPacker a report for the nominal end of session
......@@ -171,8 +171,9 @@ private:
// Report that carries a drive status plus an optional reason; execute() is only declared here.
class ReportDriveStatus : public Report {
// Drive status captured when the report is constructed.
cta::common::dataStructures::DriveStatus m_status;
// Optional reason stored alongside the status; only the two-argument constructor sets it.
cta::optional<std::string> m_reason;
public:
// Stores the status only; m_reason is default-constructed (presumably empty - confirm).
ReportDriveStatus(cta::common::dataStructures::DriveStatus status): m_status(status) {}
// Stores the status and a copy of the optional reason.
ReportDriveStatus(cta::common::dataStructures::DriveStatus status, const cta::optional<std::string> & reason): m_status(status),m_reason(reason) {}
void execute(MigrationReportPacker& reportPacker) override;
};
......
......@@ -89,9 +89,9 @@ void RecallReportPacker::reportEndOfSession(){
//------------------------------------------------------------------------------
//reportDriveStatus
//------------------------------------------------------------------------------
// Queues a drive status report: creates a MutexLocker on m_producterProtection and pushes a
// newly allocated ReportDriveStatus onto m_fifo.
// NOTE(review): two signature lines and two push calls appear back to back; presumably they are
// the before/after versions from the commit diff (without and with the optional reason) - confirm.
void RecallReportPacker::reportDriveStatus(cta::common::dataStructures::DriveStatus status) {
void RecallReportPacker::reportDriveStatus(cta::common::dataStructures::DriveStatus status, const cta::optional<std::string> & reason) {
cta::threading::MutexLocker ml(m_producterProtection);
m_fifo.push(new ReportDriveStatus(status));
m_fifo.push(new ReportDriveStatus(status,reason));
}
......@@ -166,7 +166,7 @@ bool RecallReportPacker::ReportEndofSession::goingToEnd() {
//ReportDriveStatus::execute
//------------------------------------------------------------------------------
void RecallReportPacker::ReportDriveStatus::execute(RecallReportPacker& parent){
parent.m_retrieveMount->setDriveStatus(m_status);
parent.m_retrieveMount->setDriveStatus(m_status,m_reason);
if(m_status==cta::common::dataStructures::DriveStatus::Unmounting) {
parent.setTapeDone();
parent.setTapeComplete();
......
......@@ -86,7 +86,7 @@ public:
* function is to be used by the tape thread when running.
* @param state the new drive state.
*/
virtual void reportDriveStatus(cta::common::dataStructures::DriveStatus status);
virtual void reportDriveStatus(cta::common::dataStructures::DriveStatus status, const cta::optional<std::string> & reason = cta::nullopt);
/**
* Flag disk thread as done.
......@@ -166,9 +166,10 @@ private:
// Report that carries a drive status plus an optional reason; execute() and goingToEnd() are
// only declared here.
class ReportDriveStatus : public Report {
// Drive status captured when the report is constructed.
cta::common::dataStructures::DriveStatus m_status;
// Optional reason stored alongside the status; only the two-argument constructor sets it.
cta::optional<std::string> m_reason;
public:
// Stores the status only; m_reason is default-constructed (presumably empty - confirm).
ReportDriveStatus(cta::common::dataStructures::DriveStatus status): m_status(status) {}
// Stores the status and a copy of the optional reason.
ReportDriveStatus(cta::common::dataStructures::DriveStatus status,const cta::optional<std::string> & reason): m_status(status), m_reason(reason) {}
void execute(RecallReportPacker& reportPacker) override;
bool goingToEnd() override;
};
......
......@@ -134,7 +134,7 @@ namespace unitTests
// Unimplemented stubs: every override below throws std::runtime_error("Not implemented") when called.
std::list<std::unique_ptr<cta::SchedulerDatabase::RetrieveJob> > getNextJobBatch(uint64_t filesRequested, uint64_t bytesRequested,
cta::disk::DiskSystemFreeSpaceList & diskSystemFreeSpace, cta::log::LogContext& logContext) override { throw std::runtime_error("Not implemented");}
void complete(time_t completionTime) override { throw std::runtime_error("Not implemented"); }
// NOTE(review): the two setDriveStatus lines are presumably the before/after versions from the
// commit diff; the second one adds the optional reason parameter - confirm.
void setDriveStatus(cta::common::dataStructures::DriveStatus status, time_t completionTime) override { throw std::runtime_error("Not implemented"); }
void setDriveStatus(cta::common::dataStructures::DriveStatus status, time_t completionTime,const cta::optional<std::string> & reason) override { throw std::runtime_error("Not implemented"); }
void setTapeSessionStats(const castor::tape::tapeserver::daemon::TapeSessionStats &stats) override { throw std::runtime_error("Not implemented"); }
void flushAsyncSuccessReports(std::list<cta::SchedulerDatabase::RetrieveJob*>& jobsBatch, cta::log::LogContext& lc) override { throw std::runtime_error("Not implemented"); }
};
......
......@@ -123,10 +123,13 @@ castor::tape::tapeserver::daemon::TapeReadSingleThread::TapeCleaning::~TapeClean
} catch(const cta::exception::Exception& ex){
// Something failed during the cleaning
m_this.m_hardwareStatus = Session::MARK_DRIVE_AS_DOWN;
m_this.m_rrp.reportDriveStatus(cta::common::dataStructures::DriveStatus::Down);
const int logLevel = cta::log::ERR;
const std::string errorMsg = "Exception in TapeReadSingleThread-TapeCleaning when unmounting the tape. Putting the drive down.";
cta::optional<std::string> reason = cta::common::dataStructures::DesiredDriveState::generateReasonFromLogMsg(logLevel,errorMsg);
m_this.m_rrp.reportDriveStatus(cta::common::dataStructures::DriveStatus::Down,reason);
cta::log::ScopedParamContainer scoped(m_this.m_logContext);
scoped.add("exceptionMessage", ex.getMessageValue());
m_this.m_logContext.log(cta::log::ERR, "Exception in TapeReadSingleThread-TapeCleaning when unmounting the tape. Putting the drive down.");
m_this.m_logContext.log(logLevel, errorMsg);
try {
if (currentErrorToCount.size()) {
m_this.m_watchdog.addToErrorCount(currentErrorToCount);
......@@ -135,8 +138,11 @@ castor::tape::tapeserver::daemon::TapeReadSingleThread::TapeCleaning::~TapeClean
} catch (...) {
// Notify something failed during the cleaning
m_this.m_hardwareStatus = Session::MARK_DRIVE_AS_DOWN;
m_this.m_rrp.reportDriveStatus(cta::common::dataStructures::DriveStatus::Down);
m_this.m_logContext.log(cta::log::ERR, "Non-Castor exception in TapeReadSingleThread-TapeCleaning when unmounting the tape. Putting the drive down.");
const int logLevel = cta::log::ERR;
const std::string errorMsg = "Non-Castor exception in TapeReadSingleThread-TapeCleaning when unmounting the tape. Putting the drive down.";
cta::optional<std::string> reason = cta::common::dataStructures::DesiredDriveState::generateReasonFromLogMsg(logLevel,errorMsg);
m_this.m_rrp.reportDriveStatus(cta::common::dataStructures::DriveStatus::Down,reason);
m_this.m_logContext.log(logLevel, errorMsg);
try {
if (currentErrorToCount.size()) {
m_this.m_watchdog.addToErrorCount(currentErrorToCount);
......
......@@ -65,7 +65,7 @@ castor::tape::tapeserver::daemon::TapeWriteSingleThread::TapeCleaning::~TapeClea
m_this.m_logContext.log(cta::log::ERR, "Failed to turn off encryption before unmounting");
}
m_this.m_stats.encryptionControlTime += m_timer.secs(cta::utils::Timer::resetCounter);
m_this.m_reportPacker.reportDriveStatus(cta::common::dataStructures::DriveStatus::CleaningUp, m_this.m_logContext);
m_this.m_reportPacker.reportDriveStatus(cta::common::dataStructures::DriveStatus::CleaningUp, cta::nullopt, m_this.m_logContext);
// This out-of-try-catch variables allows us to record the stage of the
// process we're in, and to count the error if it occurs.
// We will not record errors for an empty string. This will allow us to
......@@ -92,7 +92,7 @@ castor::tape::tapeserver::daemon::TapeWriteSingleThread::TapeCleaning::~TapeClea
}
// in the special case of a "manual" mode tape, we should skip the unload too.
if (cta::mediachanger::TAPE_LIBRARY_TYPE_MANUAL != m_this.m_drive.config.librarySlot().getLibraryType()) {
m_this.m_reportPacker.reportDriveStatus(cta::common::dataStructures::DriveStatus::Unloading, m_this.m_logContext);
m_this.m_reportPacker.reportDriveStatus(cta::common::dataStructures::DriveStatus::Unloading,cta::nullopt, m_this.m_logContext);
m_this.m_drive.unloadTape();
m_this.m_logContext.log(cta::log::INFO, "TapeWriteSingleThread: Tape unloaded");
} else {
......@@ -103,10 +103,10 @@ castor::tape::tapeserver::daemon::TapeWriteSingleThread::TapeCleaning::~TapeClea
// In case of manual mode, this will be filtered by the rmc daemon
// (which will do nothing)
currentErrorToCount = "Error_tapeDismount";
m_this.m_reportPacker.reportDriveStatus(cta::common::dataStructures::DriveStatus::Unmounting, m_this.m_logContext);
m_this.m_reportPacker.reportDriveStatus(cta::common::dataStructures::DriveStatus::Unmounting, cta::nullopt, m_this.m_logContext);
m_this.m_mc.dismountTape(m_this.m_volInfo.vid, m_this.m_drive.config.librarySlot());
m_this.m_drive.disableLogicalBlockProtection();
m_this.m_reportPacker.reportDriveStatus(cta::common::dataStructures::DriveStatus::Up, m_this.m_logContext);
m_this.m_reportPacker.reportDriveStatus(cta::common::dataStructures::DriveStatus::Up, cta::nullopt, m_this.m_logContext);
m_this.m_stats.unmountTime += m_timer.secs(cta::utils::Timer::resetCounter);
m_this.m_logContext.log(cta::log::INFO, cta::mediachanger::TAPE_LIBRARY_TYPE_MANUAL != m_this.m_drive.config.librarySlot().getLibraryType() ?
"TapeWriteSingleThread : tape unmounted":"TapeWriteSingleThread : tape NOT unmounted (manual mode)");
......@@ -117,10 +117,13 @@ castor::tape::tapeserver::daemon::TapeWriteSingleThread::TapeCleaning::~TapeClea
catch(const cta::exception::Exception& ex){
// Notify something failed during the cleaning
m_this.m_hardwareStatus = Session::MARK_DRIVE_AS_DOWN;
m_this.m_reportPacker.reportDriveStatus(cta::common::dataStructures::DriveStatus::Down, m_this.m_logContext);
const int logLevel = cta::log::ERR;
const std::string errorMsg = "Exception in TapeWriteSingleThread-TapeCleaning. Putting the drive down.";
cta::optional<std::string> reason = cta::common::dataStructures::DesiredDriveState::generateReasonFromLogMsg(logLevel,errorMsg);
m_this.m_reportPacker.reportDriveStatus(cta::common::dataStructures::DriveStatus::Down,reason, m_this.m_logContext);
cta::log::ScopedParamContainer scoped(m_this.m_logContext);
scoped.add("exceptionMessage", ex.getMessageValue());
m_this.m_logContext.log(cta::log::ERR, "Exception in TapeWriteSingleThread-TapeCleaning. Putting the drive down.");
m_this.m_logContext.log(logLevel, errorMsg);
// As we do not throw exceptions from here, the watchdog signalling has
// to occur from here.
try {
......@@ -130,9 +133,12 @@ castor::tape::tapeserver::daemon::TapeWriteSingleThread::TapeCleaning::~TapeClea
} catch (...) {}
} catch (...) {
// Notify something failed during the cleaning
const int logLevel = cta::log::ERR;
const std::string errorMsg = "Non-Castor exception in TapeWriteSingleThread-TapeCleaning when unmounting the tape. Putting the drive down.";
cta::optional<std::string> reason = cta::common::dataStructures::DesiredDriveState::generateReasonFromLogMsg(logLevel,errorMsg);
m_this.m_hardwareStatus = Session::MARK_DRIVE_AS_DOWN;
m_this.m_reportPacker.reportDriveStatus(cta::common::dataStructures::DriveStatus::Down, m_this.m_logContext);
m_this.m_logContext.log(cta::log::ERR, "Non-Castor exception in TapeWriteSingleThread-TapeCleaning when unmounting the tape. Putting the drive down.");
m_this.m_reportPacker.reportDriveStatus(cta::common::dataStructures::DriveStatus::Down,reason,m_this.m_logContext);
m_this.m_logContext.log(logLevel,errorMsg);
try {
if (currentErrorToCount.size()) {
m_this.m_watchdog.addToErrorCount(currentErrorToCount);
......@@ -306,7 +312,7 @@ void castor::tape::tapeserver::daemon::TapeWriteSingleThread::run() {
// will also take care of the TapeServerReporter
//
TapeCleaning cleaner(*this, timer);
m_reportPacker.reportDriveStatus(cta::common::dataStructures::DriveStatus::Mounting, m_logContext);
m_reportPacker.reportDriveStatus(cta::common::dataStructures::DriveStatus::Mounting,cta::nullopt, m_logContext);
// Before anything, the tape should be mounted
// This call does the logging of the mount
cta::log::ScopedParamContainer params(m_logContext);
......@@ -404,7 +410,7 @@ void castor::tape::tapeserver::daemon::TapeWriteSingleThread::run() {
// Tasks handle their error logging themselves.
currentErrorToCount = "";
std::unique_ptr<TapeWriteTask> task;
m_reportPacker.reportDriveStatus(cta::common::dataStructures::DriveStatus::Transferring, m_logContext);
m_reportPacker.reportDriveStatus(cta::common::dataStructures::DriveStatus::Transferring,cta::nullopt, m_logContext);
while(1) {
//get a task
task.reset(m_tasks.pop());
......
......@@ -952,7 +952,9 @@ int DriveHandler::runChild() {
if (m_previousSession == PreviousSession::Crashed && m_previousType == SessionType::Cleanup) {
log::ScopedParamContainer params(lc);
params.add("tapeDrive", m_configLine.unitName);
lc.log(log::ERR, "In DriveHandler::runChild(): the cleaner session crashed. Putting the drive down.");
int logLevel = log::ERR;
std::string errorMsg = "In DriveHandler::runChild(): the cleaner session crashed. Putting the drive down.";
lc.log(log::ERR, errorMsg);
// Get hold of the scheduler.
try {
cta::common::dataStructures::DriveInfo driveInfo;
......@@ -964,6 +966,7 @@ int DriveHandler::runChild() {
cta::common::dataStructures::DesiredDriveState driveState;
driveState.up = false;
driveState.forceDown = false;
driveState.setReasonFromLogMsg(logLevel,errorMsg);
scheduler.setDesiredDriveState(securityIdentity, m_configLine.unitName,driveState, lc);
return castor::tape::tapeserver::daemon::Session::MARK_DRIVE_AS_DOWN;
} catch (cta::exception::Exception &ex) {
......@@ -982,7 +985,9 @@ int DriveHandler::runChild() {
SessionState::Running, SessionState::Unmounting };
if (m_previousSession == PreviousSession::Crashed && statesRequiringCleaner.count(m_previousState)) {
if (!m_previousVid.size()) {
lc.log(log::ERR, "In DriveHandler::runChild(): Should run cleaner but VID is missing. Putting the drive down.");
int logLevel = log::ERR;
std::string errorMsg = "In DriveHandler::runChild(): Should run cleaner but VID is missing. Putting the drive down.";
lc.log(log::ERR, errorMsg);
try {
cta::common::dataStructures::DriveInfo driveInfo;
driveInfo.driveName=m_configLine.unitName;
......@@ -993,6 +998,7 @@ int DriveHandler::runChild() {
cta::common::dataStructures::DesiredDriveState driveState;
driveState.up = false;
driveState.forceDown = false;
driveState.setReasonFromLogMsg(logLevel,errorMsg);
scheduler.setDesiredDriveState(securityIdentity, m_configLine.unitName, driveState, lc);
return castor::tape::tapeserver::daemon::Session::MARK_DRIVE_AS_DOWN;
} catch (cta::exception::Exception &ex) {
......@@ -1078,7 +1084,9 @@ int DriveHandler::runChild() {
// Log that we put the drive's desired state to down and do it.
log::ScopedParamContainer params(lc);
params.add("tapeDrive", m_configLine.unitName);
lc.log(log::INFO, "Setting the drive down at daemon startup");
int logLevel = log::INFO;
std::string msg = "Setting the drive down at daemon startup";
lc.log(logLevel, msg);
try {
// Before setting the desired state as down, we have to make sure the drive exists in the registry.
// this is done by reporting the drive as down first.
......@@ -1105,7 +1113,13 @@ int DriveHandler::runChild() {
cta::common::dataStructures::SecurityIdentity securityIdentity;
cta::common::dataStructures::DesiredDriveState driveState;
driveState.up = false;
driveState.forceDown = false;
driveState.forceDown = false;
// Get the drive state to see if there is a reason or not, we don't want to change the reason
// why a drive is down at the startup of the tapeserver
cta::common::dataStructures::DesiredDriveState currentDesiredDriveState = scheduler.getDesiredDriveState(m_configLine.unitName,lc);
if(!currentDesiredDriveState.reason){
driveState.setReasonFromLogMsg(logLevel,msg);
}
scheduler.setDesiredDriveState(securityIdentity, m_configLine.unitName, driveState, lc);
scheduler.reportDriveConfig(m_configLine,m_tapedConfig,lc);
} catch (cta::exception::Exception & ex) {
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment