Commit 492fe696 authored by Giuseppe Lo Presti's avatar Giuseppe Lo Presti
Browse files

Fixed bug #103625: Incorrect logic to clean up repack requests may lead

submitted requests to disappear.
The logic to clean up old requests has been changed: failed requests
are kept respecting the timeout in CastorConfig, successful requests
may be dropped earlier (as in the past) to keep the SubRequest table from
growing too much.
In consequence, the repack reporting has been adapted: the 'migrated'
count is now computed by difference with the other counters, while the
'failed' count can be trusted because of the above.
parent f1a6d684
......@@ -216,8 +216,9 @@ AS
LVL_DEBUG CONSTANT PLS_INTEGER := 7; /* LOG_DEBUG Debug-level messages */
/* messages */
FILE_DROPPED_BY_CLEANING CONSTANT VARCHAR2(2048) := 'File was dropped by internal cleaning';
PUTDONE_ENFORCED_BY_CLEANING CONSTANT VARCHAR2(2048) := 'PutDone enforced by internal cleaning';
FILE_DROPPED_BY_CLEANING CONSTANT VARCHAR2(2048) := 'deleteOutOfDateStageOutDCs: File was dropped by internal cleaning';
PUTDONE_ENFORCED_BY_CLEANING CONSTANT VARCHAR2(2048) := 'deleteOutOfDateStageOutDCs: PutDone enforced by internal cleaning';
DELETING_REQUESTS CONSTANT VARCHAR2(2048) := 'deleteTerminatedRequests: Cleaning up completed requests';
DBJOB_UNEXPECTED_EXCEPTION CONSTANT VARCHAR2(2048) := 'Unexpected exception caught in DB job';
......
......@@ -592,11 +592,10 @@ END;
/
/* A generic method to delete requests of a given type */
CREATE OR REPLACE Procedure bulkDeleteRequests(reqType IN VARCHAR, timeOut IN INTEGER) AS
CREATE OR REPLACE Procedure bulkDeleteRequests(reqType IN VARCHAR) AS
BEGIN
bulkDelete('SELECT id FROM '|| reqType ||' R WHERE
NOT EXISTS (SELECT 1 FROM SubRequest WHERE request = R.id) AND lastModificationTime < getTime() - '
|| timeOut ||';',
NOT EXISTS (SELECT 1 FROM SubRequest WHERE request = R.id);',
reqType);
END;
/
......@@ -604,31 +603,47 @@ END;
/* Search and delete old archived/failed subrequests and their requests */
CREATE OR REPLACE PROCEDURE deleteTerminatedRequests AS
timeOut INTEGER;
rateDependentTimeOut INTEGER;
rate INTEGER;
srIds "numList";
ct NUMBER;
BEGIN
-- select requested timeout from configuration table
timeout := 3600*TO_NUMBER(getConfigOption('cleaning', 'terminatedRequestsTimeout', '120'));
-- get a rough estimate of the current request processing rate
SELECT count(*) INTO rate
timeOut := 3600*TO_NUMBER(getConfigOption('cleaning', 'terminatedRequestsTimeout', '120'));
-- compute a rate-dependent timeout for the successful requests by looking at the
-- last half-hour of activity: keep max 1M of them regardless the above configured timeOut.
SELECT 1800 * 1000000 / (count(*)+1) INTO rateDependentTimeOut
FROM SubRequest
WHERE status IN (9, 11) -- FAILED_FINISHED, ARCHIVED
WHERE status = dconst.SUBREQUEST_ARCHIVED
AND lastModificationTime > getTime() - 1800;
IF rate > 0 AND (1000000 / rate * 1800) < timeOut THEN
timeOut := 1000000 / rate * 1800; -- keep 1M requests max
IF rateDependentTimeOut > timeOut THEN
rateDependentTimeOut := timeOut;
END IF;
-- delete castorFiles if nothing is left for them. Here we use
-- Delete castorFiles if nothing is left for them. Here we use
-- a temporary table as we need to commit every ~1000 operations
-- and keeping a cursor opened on the original select may take
-- too long, leading to ORA-01555 'snapshot too old' errors.
EXECUTE IMMEDIATE 'TRUNCATE TABLE DeleteTermReqHelper';
INSERT /*+ APPEND */ INTO DeleteTermReqHelper (srId, cfId)
(SELECT id, castorFile FROM SubRequest
WHERE status IN (9, 11)
AND lastModificationTime < getTime() - timeOut);
(SELECT SR.id, castorFile FROM SubRequest SR
WHERE (SR.status = dconst.SUBREQUEST_ARCHIVED
AND SR.lastModificationTime < getTime() - rateDependentTimeOut)
-- failed subrequests are kept according to the configured timeout
OR (SR.status = dconst.SUBREQUEST_FAILED_FINISHED
AND reqType != 119 AND SR.lastModificationTime < getTime() - timeOut)); -- StageRepackRequest
-- 2nd part, separated from above for efficiency reasons
INSERT /*+ APPEND */ INTO DeleteTermReqHelper (srId, cfId)
(SELECT SR.id, castorFile FROM SubRequest SR, StageRepackRequest R
WHERE SR.status = dconst.SUBREQUEST_FAILED_FINISHED
-- only for the Repack case, we keep all failed subrequests around until
-- the whole Repack request is over for more than <timeOut> seconds
AND reqType = 119 AND R.lastModificationTime < getTime() - timeOut -- StageRepackRequest
AND R.id = SR.request);
COMMIT;
SELECT count(*) INTO ct FROM DeleteTermReqHelper;
logToDLF(NULL, dlf.LVL_SYSTEM, dlf.DELETING_REQUESTS, 0, '', 'stagerd',
'SubRequestsCount=' || ct);
ct := 0;
FOR cf IN (SELECT UNIQUE cfId FROM DeleteTermReqHelper) LOOP
deleteCastorFile(cf.cfId);
......@@ -639,7 +654,7 @@ BEGIN
END IF;
END LOOP;
-- now delete all old subRequest. We reuse here the
-- Now delete all old subRequests. We reuse here the
-- temporary table, which serves as a snapshot of the
-- entries to be deleted, and we use the FORALL logic
-- (cf. bulkDelete) instead of a simple DELETE ...
......@@ -666,30 +681,33 @@ BEGIN
EXECUTE IMMEDIATE 'TRUNCATE TABLE DeleteTermReqHelper';
-- And then related Requests, now orphaned.
-- The timeout makes sure we keep very recent requests,
-- even if they have no subrequests. This may actually
-- be the case only for Repack requests, as they're
-- created empty and filled after querying the NS.
---- Get ----
bulkDeleteRequests('StageGetRequest', timeOut);
bulkDeleteRequests('StageGetRequest');
---- Put ----
bulkDeleteRequests('StagePutRequest', timeOut);
bulkDeleteRequests('StagePutRequest');
---- Update ----
bulkDeleteRequests('StageUpdateRequest', timeOut);
bulkDeleteRequests('StageUpdateRequest');
---- PrepareToGet -----
bulkDeleteRequests('StagePrepareToGetRequest', timeOut);
bulkDeleteRequests('StagePrepareToGetRequest');
---- PrepareToPut ----
bulkDeleteRequests('StagePrepareToPutRequest', timeOut);
bulkDeleteRequests('StagePrepareToPutRequest');
---- PrepareToUpdate ----
bulkDeleteRequests('StagePrepareToUpdateRequest', timeOut);
bulkDeleteRequests('StagePrepareToUpdateRequest');
---- PutDone ----
bulkDeleteRequests('StagePutDoneRequest', timeOut);
bulkDeleteRequests('StagePutDoneRequest');
---- Rm ----
bulkDeleteRequests('StageRmRequest', timeOut);
---- Repack ----
bulkDeleteRequests('StageRepackRequest', timeOut);
bulkDeleteRequests('StageRmRequest');
---- SetGCWeight ----
bulkDeleteRequests('SetFileGCWeight', timeOut);
bulkDeleteRequests('SetFileGCWeight');
-- Finally deal with Repack: this case is different because StageRepackRequests may be empty
-- at the beginning. Therefore we only drop repacks that are in a completed state
-- for more than <timeOut> seconds. FINISHED, FAILED, ABORTED
bulkDelete('SELECT id FROM StageRepackRequest R WHERE status IN (2, 3, 5)
AND NOT EXISTS (SELECT 1 FROM SubRequest WHERE request = R.id)
AND lastModificationTime < getTime() - ' || timeOut,
'StageRepackRequest');
END;
/
......
......@@ -149,20 +149,20 @@ def _printStatusLine(subTime, lastModTime, user, machine, vid, status, total, si
if total > 0 and status != 'STARTING':
# Completion % takes into account each operation (recall and migration) as 1, so that e.g. a tape with half files recalled and none migrated counts as 25% done;
# no weight is given to recalls vs. migrations nor is the file size taken into account.
print '%-18s%10s%17s%29s%9s%11d%12s%10d%10d%10d%10d%7d%%%11s' % (subTime, repackTime, user, machine, vid, total, size, toRecall, toMigr, failed, migrated, 100-(2*toRecall+toMigr)*50.0/total, status)
print '%-18s%12s%17s%29s%9s%11d%12s%10d%10d%10d%10d%7d%%%11s' % (subTime, repackTime, user, machine, vid, total, size, toRecall, toMigr, failed, migrated, 100-(2*toRecall+toMigr)*50.0/total, status)
else:
print '%-18s%10s%17s%29s%9s%11d%12s%10d%10d%10d%10d N/A %11s' % (subTime, repackTime, user, machine, vid, total, size, toRecall, toMigr, failed, migrated, status)
print '%-18s%12s%17s%29s%9s%11d%12s%10d%10d%10d%10d N/A %11s' % (subTime, repackTime, user, machine, vid, total, size, toRecall, toMigr, failed, migrated, status)
except TypeError:
# case of reporting without details
print '%-18s%10s%17s%29s%9s%11d%12s%11s' % (subTime, repackTime, user, machine, vid, total, size, status)
print '%-18s%12s%17s%29s%9s%11d%12s%11s' % (subTime, repackTime, user, machine, vid, total, size, status)
def _displaySummary(rows):
'''prints a summary of all repack requests according to the given criteria'''
# print out results
print '='*117
print 'SubmitTime RepackTime User Machine Vid Total Size Status'
print '-'*117
print '='*119
print 'SubmitTime RepackTime User Machine Vid Total Size Status'
print '-'*119
sumTotal = sumSize = 0
globalStatus = 2 # FINISHED
shownVids = set()
......@@ -182,7 +182,7 @@ def _displaySummary(rows):
globalStatus = 0 # STARTING
if (status == 1 or status == 4) and globalStatus == 2:
globalStatus = 1 # ONGOING
print '-'*117
print '-'*119
# when showing full statistics, print out summary line
if not vids and sumTotal > 0:
_printStatusLine(time.time(), 0, '-', 'TOTAL', count, globalStatus, sumTotal, sumSize, '-', '-', '-', '-')
......@@ -190,18 +190,19 @@ def _displaySummary(rows):
def _displayDetailed(rows):
'''prints the details of all repack requests according to the given criteria'''
# print out results
print '='*165
print 'SubmitTime RepackTime User Machine Vid Total Size toRecall toMigr Failed Migrated Compl% Status'
print '-'*165
print '='*167
print 'SubmitTime RepackTime User Machine Vid Total Size toRecall toMigr Failed Migrated Compl% Status'
print '-'*167
sumTotal = sumSize = sumToRecall = sumToMigr = sumFailed = sumMigrated = 0
globalStatus = 2 # FINISHED
shownVids = set()
count = 0
for subTime, lastModTime, user, machine, vid, status, total, totsize, toRecall, toMigr, failed, migrated in rows:
for subTime, lastModTime, user, machine, vid, status, total, totsize, toRecall, toMigr, failed in rows:
if not vids and vid in shownVids:
# when showing full statistics skip the less recent requests for each VID
continue
shownVids.add(vid)
migrated = total - toRecall - toMigr - failed
_printStatusLine(subTime, lastModTime, user, machine, vid, status, total, totsize, toRecall, toMigr, failed, migrated)
sumTotal += total
if totsize > 0:
......@@ -216,7 +217,7 @@ def _displayDetailed(rows):
globalStatus = 0 # STARTING
if (status == 1 or status == 4) and globalStatus == 2:
globalStatus = 1 # ONGOING
print '-'*165
print '-'*167
# when showing full statistics, print out summary line
if not vids and sumTotal > 0:
_printStatusLine(time.time(), 0, '-', 'TOTAL', count, globalStatus, sumTotal, sumSize, sumToRecall, sumToMigr, sumFailed, sumMigrated)
......@@ -244,8 +245,7 @@ def displayStatus(detailed):
StageRepackRequest.fileCount total, StageRepackRequest.totalSize totsize,
SUM(CASE WHEN SubRequest.status IN (4, 5) THEN 1 ELSE 0 END) toRecall,
SUM(CASE SubRequest.status WHEN 12 THEN 1 ELSE 0 END) toMigr,
SUM(CASE WHEN SubRequest.status IN (7, 9) THEN 1 ELSE 0 END) failed,
SUM(CASE WHEN SubRequest.status IN (8, 11) THEN 1 ELSE 0 END) migrated
SUM(CASE WHEN SubRequest.status IN (7, 9) THEN 1 ELSE 0 END) failed
FROM StageRepackRequest, SubRequest
WHERE SubRequest.request(+) = StageRepackRequest.id
'''
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment