Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
dCache
cta
Commits
fc963e95
Commit
fc963e95
authored
Apr 03, 2018
by
Eric Cano
Browse files
Added recording and reporting of job failure reasons.
The error(s) gets recorded at failure time, and the list is printed at job deletion time.
parent
65c592f7
Changes
25
Hide whitespace changes
Inline
Side-by-side
objectstore/ArchiveRequest.cpp
View file @
fc963e95
...
@@ -85,7 +85,7 @@ bool cta::objectstore::ArchiveRequest::setJobSuccessful(uint16_t copyNumber) {
...
@@ -85,7 +85,7 @@ bool cta::objectstore::ArchiveRequest::setJobSuccessful(uint16_t copyNumber) {
}
}
bool
cta
::
objectstore
::
ArchiveRequest
::
addJobFailure
(
uint16_t
copyNumber
,
bool
cta
::
objectstore
::
ArchiveRequest
::
addJobFailure
(
uint16_t
copyNumber
,
uint64_t
mountId
,
log
::
LogContext
&
lc
)
{
uint64_t
mountId
,
const
std
::
string
&
failureReason
,
log
::
LogContext
&
lc
)
{
checkPayloadWritable
();
checkPayloadWritable
();
// Find the job and update the number of failures
// Find the job and update the number of failures
// (and return the job status: failed (true) or to be retried (false))
// (and return the job status: failed (true) or to be retried (false))
...
@@ -99,6 +99,7 @@ bool cta::objectstore::ArchiveRequest::addJobFailure(uint16_t copyNumber,
...
@@ -99,6 +99,7 @@ bool cta::objectstore::ArchiveRequest::addJobFailure(uint16_t copyNumber,
j
.
set_lastmountwithfailure
(
mountId
);
j
.
set_lastmountwithfailure
(
mountId
);
}
}
j
.
set_totalretries
(
j
.
totalretries
()
+
1
);
j
.
set_totalretries
(
j
.
totalretries
()
+
1
);
*
j
.
mutable_failurelogs
()
->
Add
()
=
failureReason
;
}
}
if
(
j
.
totalretries
()
>=
j
.
maxtotalretries
())
{
if
(
j
.
totalretries
()
>=
j
.
maxtotalretries
())
{
j
.
set_status
(
serializers
::
AJS_Failed
);
j
.
set_status
(
serializers
::
AJS_Failed
);
...
@@ -617,8 +618,12 @@ bool ArchiveRequest::finishIfNecessary(log::LogContext & lc) {
...
@@ -617,8 +618,12 @@ bool ArchiveRequest::finishIfNecessary(log::LogContext & lc) {
for
(
auto
&
j
:
jl
)
for
(
auto
&
j
:
jl
)
if
(
!
finishedStatuses
.
count
(
j
.
status
()))
if
(
!
finishedStatuses
.
count
(
j
.
status
()))
return
false
;
return
false
;
remove
();
log
::
ScopedParamContainer
params
(
lc
);
log
::
ScopedParamContainer
params
(
lc
);
// Attach every recorded failure reason to the log parameters, each under its
// own key ("failure0", "failure1", ...). The counter has to advance on every
// iteration; without the increment all entries would share the key "failure0".
size_t failureNumber = 0;
for (const auto & failure: getFailures()) {
  params.add(std::string("failure") + std::to_string(failureNumber++), failure);
}
remove
();
params
.
add
(
"archiveRequestObject"
,
getAddressIfSet
());
params
.
add
(
"archiveRequestObject"
,
getAddressIfSet
());
for
(
auto
&
j
:
jl
)
{
for
(
auto
&
j
:
jl
)
{
params
.
add
(
std
::
string
(
"statusForCopyNb"
)
+
std
::
to_string
(
j
.
copynb
()),
statusToString
(
j
.
status
()));
params
.
add
(
std
::
string
(
"statusForCopyNb"
)
+
std
::
to_string
(
j
.
copynb
()),
statusToString
(
j
.
status
()));
...
@@ -637,5 +642,17 @@ std::string ArchiveRequest::dump() {
...
@@ -637,5 +642,17 @@ std::string ArchiveRequest::dump() {
return
headerDump
;
return
headerDump
;
}
}
// Collects the failure log strings of all jobs of this request into a single
// list, job after job, keeping the order in which they are stored in the payload.
std::list<std::string> ArchiveRequest::getFailures() {
  checkPayloadReadable();
  std::list<std::string> failureMessages;
  for (const auto & job: m_payload.jobs()) {
    const auto & logs = job.failurelogs();
    // Append this job's logs at the end of the accumulated list.
    failureMessages.insert(failureMessages.end(), logs.begin(), logs.end());
  }
  return failureMessages;
}
}}
// namespace cta::objectstore
}}
// namespace cta::objectstore
objectstore/ArchiveRequest.hpp
View file @
fc963e95
...
@@ -48,7 +48,7 @@ public:
...
@@ -48,7 +48,7 @@ public:
void
setJobSelected
(
uint16_t
copyNumber
,
const
std
::
string
&
owner
);
void
setJobSelected
(
uint16_t
copyNumber
,
const
std
::
string
&
owner
);
void
setJobPending
(
uint16_t
copyNumber
);
void
setJobPending
(
uint16_t
copyNumber
);
bool
setJobSuccessful
(
uint16_t
copyNumber
);
//< returns true if this is the last job
bool
setJobSuccessful
(
uint16_t
copyNumber
);
//< returns true if this is the last job
bool
addJobFailure
(
uint16_t
copyNumber
,
uint64_t
sessionId
,
log
::
LogContext
&
lc
);
//< returns true if the job is failed
bool
addJobFailure
(
uint16_t
copyNumber
,
uint64_t
sessionId
,
const
std
::
string
&
failureReason
,
log
::
LogContext
&
lc
);
//< returns true if the job is failed
struct
RetryStatus
{
struct
RetryStatus
{
uint64_t
retriesWithinMount
=
0
;
uint64_t
retriesWithinMount
=
0
;
uint64_t
maxRetriesWithinMount
=
0
;
uint64_t
maxRetriesWithinMount
=
0
;
...
@@ -56,6 +56,7 @@ public:
...
@@ -56,6 +56,7 @@ public:
uint64_t
maxTotalRetries
=
0
;
uint64_t
maxTotalRetries
=
0
;
};
};
RetryStatus
getRetryStatus
(
uint16_t
copyNumber
);
RetryStatus
getRetryStatus
(
uint16_t
copyNumber
);
std
::
list
<
std
::
string
>
getFailures
();
serializers
::
ArchiveJobStatus
getJobStatus
(
uint16_t
copyNumber
);
serializers
::
ArchiveJobStatus
getJobStatus
(
uint16_t
copyNumber
);
std
::
string
statusToString
(
const
serializers
::
ArchiveJobStatus
&
status
);
std
::
string
statusToString
(
const
serializers
::
ArchiveJobStatus
&
status
);
bool
finishIfNecessary
(
log
::
LogContext
&
lc
);
/**< Handling of the consequences of a job status change for the entire request.
bool
finishIfNecessary
(
log
::
LogContext
&
lc
);
/**< Handling of the consequences of a job status change for the entire request.
...
...
objectstore/RetrieveRequest.cpp
View file @
fc963e95
...
@@ -314,7 +314,8 @@ auto RetrieveRequest::getJobs() -> std::list<JobDump> {
...
@@ -314,7 +314,8 @@ auto RetrieveRequest::getJobs() -> std::list<JobDump> {
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// RetrieveRequest::addJobFailure()
// RetrieveRequest::addJobFailure()
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
bool
RetrieveRequest
::
addJobFailure
(
uint16_t
copyNumber
,
uint64_t
mountId
,
log
::
LogContext
&
lc
)
{
bool
RetrieveRequest
::
addJobFailure
(
uint16_t
copyNumber
,
uint64_t
mountId
,
const
std
::
string
&
failureReason
,
log
::
LogContext
&
lc
)
{
checkPayloadWritable
();
checkPayloadWritable
();
// Find the job and update the number of failures
// Find the job and update the number of failures
// (and return the full request status: failed (true) or to be retried (false))
// (and return the full request status: failed (true) or to be retried (false))
...
@@ -329,6 +330,7 @@ bool RetrieveRequest::addJobFailure(uint16_t copyNumber, uint64_t mountId, log::
...
@@ -329,6 +330,7 @@ bool RetrieveRequest::addJobFailure(uint16_t copyNumber, uint64_t mountId, log::
j
.
set_lastmountwithfailure
(
mountId
);
j
.
set_lastmountwithfailure
(
mountId
);
}
}
j
.
set_totalretries
(
j
.
totalretries
()
+
1
);
j
.
set_totalretries
(
j
.
totalretries
()
+
1
);
*
j
.
mutable_failurelogs
()
->
Add
()
=
failureReason
;
}
}
if
(
j
.
totalretries
()
>=
j
.
maxtotalretries
())
{
if
(
j
.
totalretries
()
>=
j
.
maxtotalretries
())
{
j
.
set_status
(
serializers
::
RJS_Failed
);
j
.
set_status
(
serializers
::
RJS_Failed
);
...
@@ -577,8 +579,23 @@ RetrieveRequest::AsyncJobDeleter * RetrieveRequest::asyncDeleteJob() {
...
@@ -577,8 +579,23 @@ RetrieveRequest::AsyncJobDeleter * RetrieveRequest::asyncDeleteJob() {
// RetrieveRequest::AsyncJobDeleter::wait()
// RetrieveRequest::AsyncJobDeleter::wait()
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
void
RetrieveRequest
::
AsyncJobDeleter
::
wait
()
{
void
RetrieveRequest
::
AsyncJobDeleter
::
wait
()
{
m_backendDeleter
->
wait
();
m_backendDeleter
->
wait
();
}
}
//------------------------------------------------------------------------------
// RetrieveRequest::getFailures()
//------------------------------------------------------------------------------
// Collects the failure log strings of all jobs of this request into a single
// list, job after job, keeping the order in which they are stored in the payload.
std::list<std::string> RetrieveRequest::getFailures() {
  checkPayloadReadable();
  std::list<std::string> failureMessages;
  for (const auto & job: m_payload.jobs()) {
    const auto & logs = job.failurelogs();
    // Append this job's logs at the end of the accumulated list.
    failureMessages.insert(failureMessages.end(), logs.begin(), logs.end());
  }
  return failureMessages;
}
}}
// namespace cta::objectstore
}}
// namespace cta::objectstore
objectstore/RetrieveRequest.hpp
View file @
fc963e95
...
@@ -64,7 +64,8 @@ public:
...
@@ -64,7 +64,8 @@ public:
AsyncJobDeleter
*
asyncDeleteJob
();
AsyncJobDeleter
*
asyncDeleteJob
();
JobDump
getJob
(
uint16_t
copyNb
);
JobDump
getJob
(
uint16_t
copyNb
);
std
::
list
<
JobDump
>
getJobs
();
std
::
list
<
JobDump
>
getJobs
();
bool
addJobFailure
(
uint16_t
copyNumber
,
uint64_t
mountId
,
log
::
LogContext
&
lc
);
/**< Returns true if the request is completely failed
bool
addJobFailure
(
uint16_t
copyNumber
,
uint64_t
mountId
,
const
std
::
string
&
failureReason
,
log
::
LogContext
&
lc
);
/**< Returns true if the request is completely failed
(telling whether we should requeue or not). */
(telling whether we should requeue or not). */
struct
RetryStatus
{
struct
RetryStatus
{
uint64_t
retriesWithinMount
=
0
;
uint64_t
retriesWithinMount
=
0
;
...
@@ -73,6 +74,7 @@ public:
...
@@ -73,6 +74,7 @@ public:
uint64_t
maxTotalRetries
=
0
;
uint64_t
maxTotalRetries
=
0
;
};
};
RetryStatus
getRetryStatus
(
uint16_t
copyNumber
);
RetryStatus
getRetryStatus
(
uint16_t
copyNumber
);
std
::
list
<
std
::
string
>
getFailures
();
std
::
string
statusToString
(
const
serializers
::
RetrieveJobStatus
&
status
);
std
::
string
statusToString
(
const
serializers
::
RetrieveJobStatus
&
status
);
bool
finishIfNecessary
(
log
::
LogContext
&
lc
);
/**< Handling of the consequences of a job status change for the entire request.
bool
finishIfNecessary
(
log
::
LogContext
&
lc
);
/**< Handling of the consequences of a job status change for the entire request.
* This function returns true if the request got finished. */
* This function returns true if the request got finished. */
...
...
objectstore/cta.proto
View file @
fc963e95
...
@@ -315,6 +315,7 @@ message ArchiveJob {
...
@@ -315,6 +315,7 @@ message ArchiveJob {
required
uint64
lastmountwithfailure
=
4407
;
required
uint64
lastmountwithfailure
=
4407
;
required
uint32
maxtotalretries
=
4408
;
required
uint32
maxtotalretries
=
4408
;
required
uint32
maxretrieswithinmount
=
4409
;
required
uint32
maxretrieswithinmount
=
4409
;
repeated
string
failurelogs
=
4410
;
}
}
message
ArchiveRequest
{
message
ArchiveRequest
{
...
@@ -354,6 +355,7 @@ message RetrieveJob {
...
@@ -354,6 +355,7 @@ message RetrieveJob {
required
uint32
totalretries
=
9204
;
required
uint32
totalretries
=
9204
;
required
RetrieveJobStatus
status
=
9205
;
required
RetrieveJobStatus
status
=
9205
;
required
uint64
lastmountwithfailure
=
9206
;
required
uint64
lastmountwithfailure
=
9206
;
repeated
string
failurelogs
=
9207
;
}
}
message
RetrieveRequest
{
message
RetrieveRequest
{
...
...
scheduler/ArchiveJob.cpp
View file @
fc963e95
...
@@ -115,12 +115,12 @@ std::string cta::ArchiveJob::reportURL() {
...
@@ -115,12 +115,12 @@ std::string cta::ArchiveJob::reportURL() {
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// failed
// failed
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
void
cta
::
ArchiveJob
::
failed
(
const
cta
::
exception
::
Exception
&
ex
,
log
::
LogContext
&
lc
)
{
void
cta
::
ArchiveJob
::
failed
(
const
std
::
string
&
failureReason
,
log
::
LogContext
&
lc
)
{
if
(
m_dbJob
->
fail
(
lc
))
{
if
(
m_dbJob
->
fail
(
failureReason
,
lc
))
{
std
::
string
base64ErrorReport
;
std
::
string
base64ErrorReport
;
// Construct a pipe: msg -> sign -> Base64 encode -> result goes into ret.
// Construct a pipe: msg -> sign -> Base64 encode -> result goes into ret.
const
bool
noNewLineInBase64Output
=
false
;
const
bool
noNewLineInBase64Output
=
false
;
CryptoPP
::
StringSource
ss1
(
ex
.
getMessageValue
()
,
true
,
CryptoPP
::
StringSource
ss1
(
failureReason
,
true
,
new
CryptoPP
::
Base64Encoder
(
new
CryptoPP
::
Base64Encoder
(
new
CryptoPP
::
StringSink
(
base64ErrorReport
),
noNewLineInBase64Output
));
new
CryptoPP
::
StringSink
(
base64ErrorReport
),
noNewLineInBase64Output
));
std
::
string
fullReportURL
=
m_dbJob
->
errorReportURL
+
base64ErrorReport
;
std
::
string
fullReportURL
=
m_dbJob
->
errorReportURL
+
base64ErrorReport
;
...
@@ -136,7 +136,7 @@ void cta::ArchiveJob::failed(const cta::exception::Exception &ex, log::LogConte
...
@@ -136,7 +136,7 @@ void cta::ArchiveJob::failed(const cta::exception::Exception &ex, log::LogConte
.
add
(
"diskInstance"
,
m_dbJob
->
archiveFile
.
diskInstance
)
.
add
(
"diskInstance"
,
m_dbJob
->
archiveFile
.
diskInstance
)
.
add
(
"diskFileId"
,
m_dbJob
->
archiveFile
.
diskFileId
)
.
add
(
"diskFileId"
,
m_dbJob
->
archiveFile
.
diskFileId
)
.
add
(
"fullReportURL"
,
fullReportURL
)
.
add
(
"fullReportURL"
,
fullReportURL
)
.
add
(
"errorReport"
,
ex
.
getMessageValue
()
)
.
add
(
"errorReport"
,
failureReason
)
.
add
(
"reportTime"
,
t
.
secs
());
.
add
(
"reportTime"
,
t
.
secs
());
lc
.
log
(
log
::
INFO
,
"In ArchiveJob::failed(): reported error to client."
);
lc
.
log
(
log
::
INFO
,
"In ArchiveJob::failed(): reported error to client."
);
}
catch
(
cta
::
exception
::
Exception
&
ex
)
{
}
catch
(
cta
::
exception
::
Exception
&
ex
)
{
...
@@ -144,7 +144,7 @@ void cta::ArchiveJob::failed(const cta::exception::Exception &ex, log::LogConte
...
@@ -144,7 +144,7 @@ void cta::ArchiveJob::failed(const cta::exception::Exception &ex, log::LogConte
params
.
add
(
"fileId"
,
m_dbJob
->
archiveFile
.
archiveFileID
)
params
.
add
(
"fileId"
,
m_dbJob
->
archiveFile
.
archiveFileID
)
.
add
(
"diskInstance"
,
m_dbJob
->
archiveFile
.
diskInstance
)
.
add
(
"diskInstance"
,
m_dbJob
->
archiveFile
.
diskInstance
)
.
add
(
"diskFileId"
,
m_dbJob
->
archiveFile
.
diskFileId
)
.
add
(
"diskFileId"
,
m_dbJob
->
archiveFile
.
diskFileId
)
.
add
(
"errorReport"
,
ex
.
getMessageValue
()
)
.
add
(
"errorReport"
,
failureReason
)
.
add
(
"exceptionMsg"
,
ex
.
getMessageValue
())
.
add
(
"exceptionMsg"
,
ex
.
getMessageValue
())
.
add
(
"reportTime"
,
t
.
secs
());
.
add
(
"reportTime"
,
t
.
secs
());
lc
.
log
(
log
::
ERR
,
"In ArchiveJob::failed(): failed to report error to client."
);
lc
.
log
(
log
::
ERR
,
"In ArchiveJob::failed(): failed to report error to client."
);
...
...
scheduler/ArchiveJob.hpp
View file @
fc963e95
...
@@ -102,16 +102,10 @@ public:
...
@@ -102,16 +102,10 @@ public:
virtual
catalogue
::
TapeFileWritten
validateAndGetTapeFileWritten
();
virtual
catalogue
::
TapeFileWritten
validateAndGetTapeFileWritten
();
/**
/**
* Triggers a scheduler update following the failure of the job.
* Triggers a scheduler update following the failure of the job. Retry policy will
* The reason for the failure should have been set beforehand by calling
* be applied by the scheduler.
* setFailureReason(), but failure to do it is non-fatal (a standard error
*/
* reason will be used)
virtual
void
failed
(
const
std
::
string
&
failureReason
,
log
::
LogContext
&
lc
);
* This 2 step approach allows the reason to be recorded fast in the
* tape writing thread, and the slow(er) update of the DB to be executed
* in a second thread.
*
*/
virtual
void
failed
(
const
cta
::
exception
::
Exception
&
ex
,
log
::
LogContext
&
lc
);
/**
/**
* Get the URL used for reporting
* Get the URL used for reporting
...
...
scheduler/OStoreDB/OStoreDB.cpp
View file @
fc963e95
...
@@ -2703,14 +2703,14 @@ std::set<cta::SchedulerDatabase::ArchiveJob*> OStoreDB::ArchiveMount::setJobBatc
...
@@ -2703,14 +2703,14 @@ std::set<cta::SchedulerDatabase::ArchiveJob*> OStoreDB::ArchiveMount::setJobBatc
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// OStoreDB::ArchiveJob::fail()
// OStoreDB::ArchiveJob::fail()
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
bool
OStoreDB
::
ArchiveJob
::
fail
(
log
::
LogContext
&
lc
)
{
bool
OStoreDB
::
ArchiveJob
::
fail
(
const
std
::
string
&
failureReason
,
log
::
LogContext
&
lc
)
{
if
(
!
m_jobOwned
)
if
(
!
m_jobOwned
)
throw
JobNowOwned
(
"In OStoreDB::ArchiveJob::fail: cannot fail a job not owned"
);
throw
JobNowOwned
(
"In OStoreDB::ArchiveJob::fail: cannot fail a job not owned"
);
// Lock the archive request. Fail the job.
// Lock the archive request. Fail the job.
objectstore
::
ScopedExclusiveLock
arl
(
m_archiveRequest
);
objectstore
::
ScopedExclusiveLock
arl
(
m_archiveRequest
);
m_archiveRequest
.
fetch
();
m_archiveRequest
.
fetch
();
// Add a job failure. If the job is failed, we will delete it.
// Add a job failure. If the job is failed, we will delete it.
if
(
m_archiveRequest
.
addJobFailure
(
tapeFile
.
copyNb
,
m_mountId
,
lc
))
{
if
(
m_archiveRequest
.
addJobFailure
(
tapeFile
.
copyNb
,
m_mountId
,
failureReason
,
lc
))
{
// The job will not be retried. Either another job for the same request is
// The job will not be retried. Either another job for the same request is
// queued and keeps the request referenced or the request has been deleted.
// queued and keeps the request referenced or the request has been deleted.
// In any case, we can forget it.
// In any case, we can forget it.
...
@@ -2819,14 +2819,14 @@ OStoreDB::RetrieveJob::RetrieveJob(const std::string& jobAddress, OStoreDB & oSt
...
@@ -2819,14 +2819,14 @@ OStoreDB::RetrieveJob::RetrieveJob(const std::string& jobAddress, OStoreDB & oSt
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// OStoreDB::RetrieveJob::fail()
// OStoreDB::RetrieveJob::fail()
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
bool
OStoreDB
::
RetrieveJob
::
fail
(
log
::
LogContext
&
logContext
)
{
bool
OStoreDB
::
RetrieveJob
::
fail
(
const
std
::
string
&
failureReason
,
log
::
LogContext
&
logContext
)
{
if
(
!
m_jobOwned
)
if
(
!
m_jobOwned
)
throw
JobNowOwned
(
"In OStoreDB::RetrieveJob::fail: cannot fail a job not owned"
);
throw
JobNowOwned
(
"In OStoreDB::RetrieveJob::fail: cannot fail a job not owned"
);
// Lock the retrieve request. Fail the job.
// Lock the retrieve request. Fail the job.
objectstore
::
ScopedExclusiveLock
rrl
(
m_retrieveRequest
);
objectstore
::
ScopedExclusiveLock
rrl
(
m_retrieveRequest
);
m_retrieveRequest
.
fetch
();
m_retrieveRequest
.
fetch
();
// Add a job failure. If the job is failed, we will delete it.
// Add a job failure. If the job is failed, we will delete it.
if
(
m_retrieveRequest
.
addJobFailure
(
selectedCopyNb
,
m_mountId
,
logContext
))
{
if
(
m_retrieveRequest
.
addJobFailure
(
selectedCopyNb
,
m_mountId
,
failureReason
,
logContext
))
{
// The job will not be retried. Either another job for the same request is
// The job will not be retried. Either another job for the same request is
// queued and keeps the request referenced or the request has been deleted.
// queued and keeps the request referenced or the request has been deleted.
// In any case, we can forget it.
// In any case, we can forget it.
...
@@ -2834,6 +2834,10 @@ bool OStoreDB::RetrieveJob::fail(log::LogContext &logContext) {
...
@@ -2834,6 +2834,10 @@ bool OStoreDB::RetrieveJob::fail(log::LogContext &logContext) {
m_jobOwned
=
false
;
m_jobOwned
=
false
;
log
::
ScopedParamContainer
params
(
logContext
);
log
::
ScopedParamContainer
params
(
logContext
);
params
.
add
(
"object"
,
m_retrieveRequest
.
getAddressIfSet
());
params
.
add
(
"object"
,
m_retrieveRequest
.
getAddressIfSet
());
size_t
failureNumber
=
0
;
for
(
auto
failure
:
m_retrieveRequest
.
getFailures
())
{
params
.
add
(
std
::
string
(
"failure"
)
+
std
::
to_string
(
failureNumber
++
),
failure
);
}
logContext
.
log
(
log
::
ERR
,
"In OStoreDB::RetrieveJob::fail(): request was definitely failed and deleted."
);
logContext
.
log
(
log
::
ERR
,
"In OStoreDB::RetrieveJob::fail(): request was definitely failed and deleted."
);
return
true
;
return
true
;
}
else
{
}
else
{
...
@@ -2881,7 +2885,7 @@ bool OStoreDB::RetrieveJob::fail(log::LogContext &logContext) {
...
@@ -2881,7 +2885,7 @@ bool OStoreDB::RetrieveJob::fail(log::LogContext &logContext) {
objectstore
::
ScopedExclusiveLock
rql
;
objectstore
::
ScopedExclusiveLock
rql
;
objectstore
::
Helpers
::
getLockedAndFetchedQueue
<
RetrieveQueue
>
(
rq
,
rql
,
*
m_oStoreDB
.
m_agentReference
,
bestVid
,
logContext
);
objectstore
::
Helpers
::
getLockedAndFetchedQueue
<
RetrieveQueue
>
(
rq
,
rql
,
*
m_oStoreDB
.
m_agentReference
,
bestVid
,
logContext
);
auto
rfqc
=
m_retrieveRequest
.
getRetrieveFileQueueCriteria
();
auto
rfqc
=
m_retrieveRequest
.
getRetrieveFileQueueCriteria
();
auto
&
af
=
rfqc
.
archiveFile
;
auto
&
af
=
rfqc
.
archiveFile
;
auto
&
tf
=
af
.
tapeFiles
.
at
(
bestCopyNb
);
auto
&
tf
=
af
.
tapeFiles
.
at
(
bestCopyNb
);
auto
sr
=
m_retrieveRequest
.
getSchedulerRequest
();
auto
sr
=
m_retrieveRequest
.
getSchedulerRequest
();
std
::
list
<
objectstore
::
RetrieveQueue
::
JobToAdd
>
jta
;
std
::
list
<
objectstore
::
RetrieveQueue
::
JobToAdd
>
jta
;
...
...
scheduler/OStoreDB/OStoreDB.hpp
View file @
fc963e95
...
@@ -168,7 +168,7 @@ public:
...
@@ -168,7 +168,7 @@ public:
public:
public:
CTA_GENERATE_EXCEPTION_CLASS
(
JobNowOwned
);
CTA_GENERATE_EXCEPTION_CLASS
(
JobNowOwned
);
CTA_GENERATE_EXCEPTION_CLASS
(
NoSuchJob
);
CTA_GENERATE_EXCEPTION_CLASS
(
NoSuchJob
);
bool
fail
(
log
::
LogContext
&
lc
)
override
;
bool
fail
(
const
std
::
string
&
failureReason
,
log
::
LogContext
&
lc
)
override
;
private:
private:
void
asyncSucceed
();
void
asyncSucceed
();
bool
waitAsyncSucceed
();
bool
waitAsyncSucceed
();
...
@@ -215,7 +215,7 @@ public:
...
@@ -215,7 +215,7 @@ public:
CTA_GENERATE_EXCEPTION_CLASS
(
NoSuchJob
);
CTA_GENERATE_EXCEPTION_CLASS
(
NoSuchJob
);
virtual
void
asyncSucceed
()
override
;
virtual
void
asyncSucceed
()
override
;
virtual
void
checkSucceed
()
override
;
virtual
void
checkSucceed
()
override
;
virtual
bool
fail
(
log
::
LogContext
&
)
override
;
///< Returns true if this failure is final (we will not retry).
bool
fail
(
const
std
::
string
&
failureReason
,
log
::
LogContext
&
)
override
;
virtual
~
RetrieveJob
()
override
;
virtual
~
RetrieveJob
()
override
;
private:
private:
RetrieveJob
(
const
std
::
string
&
,
OStoreDB
&
,
RetrieveMount
&
);
RetrieveJob
(
const
std
::
string
&
,
OStoreDB
&
,
RetrieveMount
&
);
...
...
scheduler/RetrieveJob.cpp
View file @
fc963e95
...
@@ -61,12 +61,12 @@ void cta::RetrieveJob::checkComplete() {
...
@@ -61,12 +61,12 @@ void cta::RetrieveJob::checkComplete() {
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// failed
// failed
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
void
cta
::
RetrieveJob
::
failed
(
const
std
::
string
&
errorReport
,
log
::
LogContext
&
lc
)
{
void
cta
::
RetrieveJob
::
failed
(
const
std
::
string
&
failureReason
,
log
::
LogContext
&
lc
)
{
if
(
m_dbJob
->
fail
(
lc
))
{
if
(
m_dbJob
->
fail
(
failureReason
,
lc
))
{
std
::
string
base64ErrorReport
;
std
::
string
base64ErrorReport
;
// Construct a pipe: msg -> sign -> Base64 encode -> result goes into ret.
// Construct a pipe: msg -> sign -> Base64 encode -> result goes into ret.
const
bool
noNewLineInBase64Output
=
false
;
const
bool
noNewLineInBase64Output
=
false
;
CryptoPP
::
StringSource
ss1
(
errorReport
,
true
,
CryptoPP
::
StringSource
ss1
(
failureReason
,
true
,
new
CryptoPP
::
Base64Encoder
(
new
CryptoPP
::
Base64Encoder
(
new
CryptoPP
::
StringSink
(
base64ErrorReport
),
noNewLineInBase64Output
));
new
CryptoPP
::
StringSink
(
base64ErrorReport
),
noNewLineInBase64Output
));
std
::
string
fullReportURL
=
m_dbJob
->
retrieveRequest
.
errorReportURL
+
base64ErrorReport
;
std
::
string
fullReportURL
=
m_dbJob
->
retrieveRequest
.
errorReportURL
+
base64ErrorReport
;
...
@@ -81,7 +81,7 @@ void cta::RetrieveJob::failed(const std::string & errorReport, log::LogContext &
...
@@ -81,7 +81,7 @@ void cta::RetrieveJob::failed(const std::string & errorReport, log::LogContext &
params
.
add
(
"fileId"
,
m_dbJob
->
archiveFile
.
archiveFileID
)
params
.
add
(
"fileId"
,
m_dbJob
->
archiveFile
.
archiveFileID
)
.
add
(
"diskInstance"
,
m_dbJob
->
archiveFile
.
diskInstance
)
.
add
(
"diskInstance"
,
m_dbJob
->
archiveFile
.
diskInstance
)
.
add
(
"diskFileId"
,
m_dbJob
->
archiveFile
.
diskFileId
)
.
add
(
"diskFileId"
,
m_dbJob
->
archiveFile
.
diskFileId
)
.
add
(
"errorReport"
,
errorReport
)
.
add
(
"errorReport"
,
failureReason
)
.
add
(
"reportTime"
,
t
.
secs
());
.
add
(
"reportTime"
,
t
.
secs
());
lc
.
log
(
log
::
INFO
,
"In RetrieveJob::failed(): reported error to client."
);
lc
.
log
(
log
::
INFO
,
"In RetrieveJob::failed(): reported error to client."
);
}
catch
(
cta
::
exception
::
Exception
&
ex
)
{
}
catch
(
cta
::
exception
::
Exception
&
ex
)
{
...
@@ -89,7 +89,7 @@ void cta::RetrieveJob::failed(const std::string & errorReport, log::LogContext &
...
@@ -89,7 +89,7 @@ void cta::RetrieveJob::failed(const std::string & errorReport, log::LogContext &
params
.
add
(
"fileId"
,
m_dbJob
->
archiveFile
.
archiveFileID
)
params
.
add
(
"fileId"
,
m_dbJob
->
archiveFile
.
archiveFileID
)
.
add
(
"diskInstance"
,
m_dbJob
->
archiveFile
.
diskInstance
)
.
add
(
"diskInstance"
,
m_dbJob
->
archiveFile
.
diskInstance
)
.
add
(
"diskFileId"
,
m_dbJob
->
archiveFile
.
diskFileId
)
.
add
(
"diskFileId"
,
m_dbJob
->
archiveFile
.
diskFileId
)
.
add
(
"errorReport"
,
errorReport
)
.
add
(
"errorReport"
,
failureReason
)
.
add
(
"exceptionMsg"
,
ex
.
getMessageValue
())
.
add
(
"exceptionMsg"
,
ex
.
getMessageValue
())
.
add
(
"reportTime"
,
t
.
secs
());
.
add
(
"reportTime"
,
t
.
secs
());
lc
.
log
(
log
::
ERR
,
"In RetrieveJob::failed(): failed to report error to client."
);
lc
.
log
(
log
::
ERR
,
"In RetrieveJob::failed(): failed to report error to client."
);
...
...
scheduler/RetrieveJob.hpp
View file @
fc963e95
...
@@ -88,11 +88,10 @@ public:
...
@@ -88,11 +88,10 @@ public:
virtual
void
checkComplete
();
virtual
void
checkComplete
();
/**
/**
* Indicates that the job failed. Like for complete(), reason for failure
* Indicates that the job failed. Reason for failure is indicated. Retry policy will
* should already be recorded in the object beforehand. Retry policy will
* be applied by the scheduler.
* be applied by the scheduler.
*/
*/
virtual
void
failed
(
const
std
::
string
&
errorReport
,
cta
::
log
::
LogContext
&
);
virtual
void
failed
(
const
std
::
string
&
failureReason
,
cta
::
log
::
LogContext
&
);
/**
/**
* Helper function returning a reference to the currently selected tape file.
* Helper function returning a reference to the currently selected tape file.
...
@@ -147,11 +146,6 @@ public:
...
@@ -147,11 +146,6 @@ public:
*/
*/
uint64_t
transferredSize
;
uint64_t
transferredSize
;
/**
* The error string. This should be set before calling failed().
*/
std
::
string
failureMessage
;
};
// class RetrieveJob
};
// class RetrieveJob
}
// namespace cta
}
// namespace cta
scheduler/SchedulerDatabase.hpp
View file @
fc963e95
...
@@ -183,7 +183,7 @@ public:
...
@@ -183,7 +183,7 @@ public:
std
::
string
errorReportURL
;
std
::
string
errorReportURL
;
cta
::
common
::
dataStructures
::
ArchiveFile
archiveFile
;
cta
::
common
::
dataStructures
::
ArchiveFile
archiveFile
;
cta
::
common
::
dataStructures
::
TapeFile
tapeFile
;
cta
::
common
::
dataStructures
::
TapeFile
tapeFile
;
virtual
bool
fail
(
log
::
LogContext
&
lc
)
=
0
;
virtual
bool
fail
(
const
std
::
string
&
failureReason
,
log
::
LogContext
&
lc
)
=
0
;
virtual
void
bumpUpTapeFileCount
(
uint64_t
newFileCount
)
=
0
;
virtual
void
bumpUpTapeFileCount
(
uint64_t
newFileCount
)
=
0
;
virtual
~
ArchiveJob
()
{}
virtual
~
ArchiveJob
()
{}
};
};
...
@@ -334,7 +334,7 @@ public:
...
@@ -334,7 +334,7 @@ public:
uint64_t
selectedCopyNb
;
uint64_t
selectedCopyNb
;
virtual
void
asyncSucceed
()
=
0
;
virtual
void
asyncSucceed
()
=
0
;
virtual
void
checkSucceed
()
=
0
;
virtual
void
checkSucceed
()
=
0
;
virtual
bool
fail
(
log
::
LogContext
&
)
=
0
;
///< Returns true if this failure is final (we will not retry).
virtual
bool
fail
(
const
std
::
string
&
failureReason
,
log
::
LogContext
&
)
=
0
;
virtual
~
RetrieveJob
()
{}
virtual
~
RetrieveJob
()
{}
};
};
...
...
scheduler/SchedulerTest.cpp
View file @
fc963e95
...
@@ -620,7 +620,7 @@ TEST_P(SchedulerTest, retry_archive_until_max_reached) {
...
@@ -620,7 +620,7 @@ TEST_P(SchedulerTest, retry_archive_until_max_reached) {
ASSERT_NE
(
0
,
archiveJobList
.
size
());
ASSERT_NE
(
0
,
archiveJobList
.
size
());
// Validate we got the right file
// Validate we got the right file
ASSERT_EQ
(
archiveFileId
,
archiveJobList
.
front
()
->
archiveFile
.
archiveFileID
);
ASSERT_EQ
(
archiveFileId
,
archiveJobList
.
front
()
->
archiveFile
.
archiveFileID
);
archiveJobList
.
front
()
->
failed
(
cta
::
exception
::
Exception
(
"Archive failed"
)
,
lc
);
archiveJobList
.
front
()
->
failed
(
"Archive failed"
,
lc
);
}
}
// Then the request should be gone
// Then the request should be gone
ASSERT_EQ
(
0
,
archiveMount
->
getNextJobBatch
(
1
,
1
,
lc
).
size
());
ASSERT_EQ
(
0
,
archiveMount
->
getNextJobBatch
(
1
,
1
,
lc
).
size
());
...
...
scheduler/testingMocks/MockArchiveJob.hpp
View file @
fc963e95
...
@@ -33,8 +33,8 @@ namespace cta {
...
@@ -33,8 +33,8 @@ namespace cta {
completes
(
0
),
failures
(
0
)
{}
completes
(
0
),
failures
(
0
)
{}
~
MockArchiveJob
()
throw
()
{}
~
MockArchiveJob
()
throw
()
{}
void
failed
(
const
cta
::
exception
::
Exception
&
ex
,
log
::
LogContext
&
lc
)
override
{
void
failed
(
const
std
::
string
&
failureReason
,
log
::
LogContext
&
lc
)
override
{
failures
++
;
failures
++
;
}
}
...
...
scheduler/testingMocks/MockRetrieveJob.hpp
View file @
fc963e95
...
@@ -35,7 +35,7 @@ namespace cta {
...
@@ -35,7 +35,7 @@ namespace cta {
}
}
virtual
void
asyncComplete
()
override
{
completes
++
;
}
virtual
void
asyncComplete
()
override
{
completes
++
;
}
virtual
void
checkComplete
()
override
{}
virtual
void
checkComplete
()
override
{}
virtual
void
failed
(
const
std
::
string
&
errorReport
,
cta
::
log
::
LogContext
&
)
override
{
failures
++
;
};
void
failed
(
const
std
::
string
&
failureReason
,
cta
::
log
::
LogContext
&
)
override
{
failures
++
;
};
~
MockRetrieveJob
()
throw
()
{}
~
MockRetrieveJob
()
throw
()
{}
};
};
...
...
tapeserver/castor/tape/tapeserver/daemon/DiskReadTaskTest.cpp
View file @
fc963e95
...
@@ -51,11 +51,12 @@ namespace unitTests{
...
@@ -51,11 +51,12 @@ namespace unitTests{
using
namespace
castor
::
tape
::
diskFile
;
using
namespace
castor
::
tape
::
diskFile
;
struct
MockMigrationReportPacker
:
public
MigrationReportPacker
{
struct
MockMigrationReportPacker
:
public
MigrationReportPacker
{
void
reportCompletedJob
(
std
::
unique_ptr
<
cta
::
ArchiveJob
>
successfulArchiveJob
)
{}
void
reportCompletedJob
(
std
::
unique_ptr
<
cta
::
ArchiveJob
>
successfulArchiveJob
,
cta
::
log
::
LogContext
&
lc
)
override
{}
void
reportFailedJob
(
std
::
unique_ptr
<
cta
::
ArchiveJob
>
failedArchiveJob
,
const
cta
::
exception
::
Exception
&
ex
)
{}
void
reportFailedJob
(
std
::
unique_ptr
<
cta
::
ArchiveJob
>
failedArchiveJob
,
void
reportEndOfSession
()
{}
const
cta
::
exception
::
Exception
&
ex
,
cta
::
log
::
LogContext
&
lc
)
override
{}
void
reportEndOfSessionWithErrors
(
const
std
::
string
msg
,
int
error_code
)
{}
void
reportEndOfSession
(
cta
::
log
::
LogContext
&
lc
)
override
{}
void
disableBulk
()
{}
void
reportEndOfSessionWithErrors
(
const
std
::
string
msg
,
int
error_code
,
cta
::
log
::
LogContext
&
lc
)
override
{}
void
disableBulk
()
override
{}
MockMigrationReportPacker
(
cta
::
ArchiveMount
*
rm
,
cta
::
log
::
LogContext
lc
)
:
MockMigrationReportPacker
(
cta
::
ArchiveMount
*
rm
,
cta
::
log
::
LogContext
lc
)
:
MigrationReportPacker
(
rm
,
lc
)
{}
MigrationReportPacker
(
rm
,
lc
)
{}
};
};
...
...
tapeserver/castor/tape/tapeserver/daemon/DiskWriteTask.cpp
View file @
fc963e95
...
@@ -36,9 +36,7 @@ namespace daemon {
...
@@ -36,9 +36,7 @@ namespace daemon {
// constructor
// constructor
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------