Commit fc963e95 authored by Eric Cano
Browse files

Added recording and reporting of job failure reasons.

The error(s) are recorded at failure time, and the list is printed at
job deletion time.
parent 65c592f7
......@@ -92,8 +92,8 @@ namespace unitTests {
return fileReport;
}
void failed(const cta::exception::Exception& ex, cta::log::LogContext & lc) override {
void failed(const std::string& failureReason, cta::log::LogContext& lc) override {
failuresRef++;
}
......
......@@ -24,6 +24,7 @@
#include "castor/tape/tapeserver/daemon/RecallReportPacker.hpp"
#include "castor/tape/tapeserver/daemon/TaskWatchDog.hpp"
#include "common/log/Logger.hpp"
#include "common/utils/utils.hpp"
#include <signal.h>
#include <iostream>
......@@ -69,8 +70,10 @@ void RecallReportPacker::reportCompletedJob(std::unique_ptr<cta::RetrieveJob> su
//------------------------------------------------------------------------------
//reportFailedJob
//------------------------------------------------------------------------------
void RecallReportPacker::reportFailedJob(std::unique_ptr<cta::RetrieveJob> failedRetrieveJob){
std::unique_ptr<Report> rep(new ReportError(std::move(failedRetrieveJob)));
void RecallReportPacker::reportFailedJob(std::unique_ptr<cta::RetrieveJob> failedRetrieveJob, const cta::exception::Exception & ex){
std::string failureLog = cta::utils::getCurrentLocalTime() + " " + cta::utils::getShortHostname() +
" " + ex.what();
std::unique_ptr<Report> rep(new ReportError(std::move(failedRetrieveJob), failureLog));
cta::threading::MutexLocker ml(m_producterProtection);
m_fifo.push(rep.release());
}
......@@ -199,14 +202,22 @@ bool RecallReportPacker::ReportEndofSessionWithErrors::goingToEnd() {
//------------------------------------------------------------------------------
//ReportError::execute
//------------------------------------------------------------------------------
void RecallReportPacker::ReportError::execute(RecallReportPacker& parent){
parent.m_errorHappened=true;
void RecallReportPacker::ReportError::execute(RecallReportPacker& reportPacker){
reportPacker.m_errorHappened=true;
{
cta::log::ScopedParamContainer params(parent.m_lc);
params.add("errorMessage", m_failedRetrieveJob->failureMessage);
parent.m_lc.log(cta::log::ERR, "In RecallReportPacker::ReportError::execute(): processing error message");
cta::log::ScopedParamContainer params(reportPacker.m_lc);
params.add("failureLog", m_failureLog)
.add("fileId", m_failedRetrieveJob->archiveFile.archiveFileID);
reportPacker.m_lc.log(cta::log::ERR,"In RecallReportPacker::ReportError::execute(): failing retrieve job after exception.");
}
try {
m_failedRetrieveJob->failed(m_failureLog, reportPacker.m_lc);
} catch (cta::exception::Exception & ex) {
cta::log::ScopedParamContainer params(reportPacker.m_lc);
params.add("ExceptionMSG", ex.getMessageValue())
.add("fileId", m_failedRetrieveJob->archiveFile.archiveFileID);
reportPacker.m_lc.log(cta::log::ERR,"In RecallReportPacker::ReportError::execute(): call to m_failedRetrieveJob->failed() threw an exception.");
}
m_failedRetrieveJob->failed(m_failedRetrieveJob->failureMessage, parent.m_lc);
}
//------------------------------------------------------------------------------
......
......@@ -62,7 +62,7 @@ public:
* @param failedRetrieveJob the retrieve job which failed
* @param ex the reason for the failure
*/
virtual void reportFailedJob(std::unique_ptr<cta::RetrieveJob> failedRetrieveJob);
virtual void reportFailedJob(std::unique_ptr<cta::RetrieveJob> failedRetrieveJob, const cta::exception::Exception & ex);
/**
* Create into the RecallReportPacker a report for the nominal end of session
......@@ -146,14 +146,14 @@ private:
void execute(RecallReportPacker& reportPacker) override;
};
class ReportError : public Report {
const std::string m_failureLog;
/**
* The failed retrieve job to be reported immediately
*/
std::unique_ptr<cta::RetrieveJob> m_failedRetrieveJob;
public:
ReportError(std::unique_ptr<cta::RetrieveJob> failedRetrieveJob):
m_failedRetrieveJob(std::move(failedRetrieveJob)) {
}
ReportError(std::unique_ptr<cta::RetrieveJob> failedRetrieveJob, const std::string &failureLog):
m_failureLog(failureLog), m_failedRetrieveJob(std::move(failedRetrieveJob)) {}
void execute(RecallReportPacker& reportPacker) override;
};
......
......@@ -52,13 +52,13 @@ protected:
MockRetrieveJobExternalStats(cta::RetrieveMount & rm, int & completes, int &failures):
MockRetrieveJob(rm), completesRef(completes), failuresRef(failures) {}
virtual void asyncComplete() override {
void asyncComplete() override {
completesRef++;
}
virtual void checkComplete() override {}
virtual void failed(const std::string & errorReport, cta::log::LogContext &) override {
void checkComplete() override {}
void failed(const std::string& failureReason, cta::log::LogContext&) override {
failuresRef++;
}
......@@ -149,8 +149,7 @@ TEST_F(castor_tape_tapeserver_daemon_RecallReportPackerTest, RecallReportPackerB
const std::string error_msg = "ERROR_TEST_MSG";
const cta::exception::Exception ex(error_msg);
job3->failureMessage = ex.getMessageValue();
rrp.reportFailedJob(std::move(job3));
rrp.reportFailedJob(std::move(job3), ex);
rrp.setDiskDone();
rrp.setTapeDone();
......
......@@ -57,20 +57,20 @@ namespace unitTests
}; // class castor_tape_tapeserver_daemonTest
struct MockRecallReportPacker : public RecallReportPacker {
void reportCompletedJob(std::unique_ptr<cta::RetrieveJob> successfulRetrieveJob) {
void reportCompletedJob(std::unique_ptr<cta::RetrieveJob> successfulRetrieveJob) override {
cta::threading::MutexLocker ml(m_mutex);
completeJobs++;
}
void reportFailedJob(std::unique_ptr<cta::RetrieveJob> failedRetrieveJob) {
void reportFailedJob(std::unique_ptr<cta::RetrieveJob> failedRetrieveJob, const cta::exception::Exception & ex) override {
cta::threading::MutexLocker ml(m_mutex);
failedJobs++;
}
void disableBulk() {}
void reportEndOfSession() {
void disableBulk() override {}
void reportEndOfSession() override {
cta::threading::MutexLocker ml(m_mutex);
endSessions++;
}
void reportEndOfSessionWithErrors(const std::string msg, int error_code) {
void reportEndOfSessionWithErrors(const std::string msg, int error_code) override {
cta::threading::MutexLocker ml(m_mutex);
endSessionsWithError++;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment