Commit f9119315 authored by Sebastien Ponce

Completed upgrade script after full DB diffing

parent 6bddb4c4
@@ -95,6 +95,10 @@ ALTER TABLE DrainingErrors
FOREIGN KEY (castorFile)
REFERENCES CastorFile (id);
DELETE FROM CastorConfig WHERE class='cleaning' and key='terminatedRequestsTimeout';
INSERT INTO CastorConfig
VALUES ('cleaning', 'failedRequestsTimeout', '168', 'Maximum timeout before removing failed requests from the database in hours');
/* This procedure is used to check if the maxReplicaNb has been exceeded
* for some CastorFiles. It checks all the files listed in TooManyReplicasHelper
* This is called from a DB job and is fed by the tr_DiskCopy_Created trigger
@@ -604,6 +608,1217 @@ BEGIN
END;
/
/**
* Package containing the definition of all DLF levels and messages logged from the SQL-to-DLF API
*/
CREATE OR REPLACE PACKAGE dlf
AS
/* message levels */
LVL_EMERGENCY CONSTANT PLS_INTEGER := 0; /* LOG_EMERG System is unusable */
LVL_ALERT CONSTANT PLS_INTEGER := 1; /* LOG_ALERT Action must be taken immediately */
LVL_CRIT CONSTANT PLS_INTEGER := 2; /* LOG_CRIT Critical conditions */
LVL_ERROR CONSTANT PLS_INTEGER := 3; /* LOG_ERR Error conditions */
LVL_WARNING CONSTANT PLS_INTEGER := 4; /* LOG_WARNING Warning conditions */
LVL_NOTICE CONSTANT PLS_INTEGER := 5; /* LOG_NOTICE Normal but significant condition */
LVL_USER_ERROR CONSTANT PLS_INTEGER := 5; /* LOG_NOTICE Normal but significant condition */
LVL_AUTH CONSTANT PLS_INTEGER := 5; /* LOG_NOTICE Normal but significant condition */
LVL_SECURITY CONSTANT PLS_INTEGER := 5; /* LOG_NOTICE Normal but significant condition */
LVL_SYSTEM CONSTANT PLS_INTEGER := 6; /* LOG_INFO Informational */
LVL_DEBUG CONSTANT PLS_INTEGER := 7; /* LOG_DEBUG Debug-level messages */
/* messages */
FILE_DROPPED_BY_CLEANING CONSTANT VARCHAR2(2048) := 'deleteOutOfDateStageOutDCs: File was dropped by internal cleaning';
PUTDONE_ENFORCED_BY_CLEANING CONSTANT VARCHAR2(2048) := 'deleteOutOfDateStageOutDCs: PutDone enforced by internal cleaning';
DELETING_REQUESTS CONSTANT VARCHAR2(2048) := 'deleteTerminatedRequests: Cleaning up completed requests';
DBJOB_UNEXPECTED_EXCEPTION CONSTANT VARCHAR2(2048) := 'Unexpected exception caught in DB job';
MIGMOUNT_NO_FILE CONSTANT VARCHAR2(2048) := 'startMigrationMounts: failed migration mount creation due to lack of files';
MIGMOUNT_AGE_NO_FILE CONSTANT VARCHAR2(2048) := 'startMigrationMounts: failed migration mount creation based on age due to lack of files';
MIGMOUNT_NEW_MOUNT CONSTANT VARCHAR2(2048) := 'startMigrationMounts: created new migration mount';
MIGMOUNT_NEW_MOUNT_AGE CONSTANT VARCHAR2(2048) := 'startMigrationMounts: created new migration mount based on age';
MIGMOUNT_NOACTION CONSTANT VARCHAR2(2048) := 'startMigrationMounts: no need for new migration mount';
RECMOUNT_NEW_MOUNT CONSTANT VARCHAR2(2048) := 'startRecallMounts: created new recall mount';
RECMOUNT_NOACTION_NODRIVE CONSTANT VARCHAR2(2048) := 'startRecallMounts: not allowed to start new recall mount. Maximum nb of drives has been reached';
RECMOUNT_NOACTION_NOCAND CONSTANT VARCHAR2(2048) := 'startRecallMounts: no candidate found for a mount';
RECALL_FOUND_ONGOING_RECALL CONSTANT VARCHAR2(2048) := 'createRecallCandidate: found already running recall';
RECALL_UNKNOWN_NS_ERROR CONSTANT VARCHAR2(2048) := 'createRecallCandidate: error when retrieving segments from namespace';
RECALL_NO_SEG_FOUND CONSTANT VARCHAR2(2048) := 'createRecallCandidate: no valid segment to recall found';
RECALL_NO_SEG_FOUND_AT_ALL CONSTANT VARCHAR2(2048) := 'createRecallCandidate: no segment found for this file. File is probably lost';
RECALL_INVALID_SEGMENT CONSTANT VARCHAR2(2048) := 'createRecallCandidate: found unusable segment';
RECALL_UNUSABLE_TAPE CONSTANT VARCHAR2(2048) := 'createRecallCandidate: found segment on unusable tape';
RECALL_CREATING_RECALLJOB CONSTANT VARCHAR2(2048) := 'createRecallCandidate: created new RecallJob';
RECALL_MISSING_COPIES CONSTANT VARCHAR2(2048) := 'createRecallCandidate: detected missing copies on tape';
RECALL_MISSING_COPIES_NOOP CONSTANT VARCHAR2(2048) := 'createRecallCandidate: detected missing copies on tape, but migrations ongoing';
RECALL_MJ_FOR_MISSING_COPY CONSTANT VARCHAR2(2048) := 'createRecallCandidate: create new MigrationJob to migrate missing copy';
RECALL_COPY_STILL_MISSING CONSTANT VARCHAR2(2048) := 'createRecallCandidate: could not find enough valid copy numbers to create missing copy';
RECALL_MISSING_COPY_NO_ROUTE CONSTANT VARCHAR2(2048) := 'createRecallCandidate: no route to tape defined for missing copy';
RECALL_MISSING_COPY_ERROR CONSTANT VARCHAR2(2048) := 'createRecallCandidate: unexpected error when creating missing copy';
RECALL_CANCEL_BY_VID CONSTANT VARCHAR2(2048) := 'Canceling tape recall for given VID';
RECALL_CANCEL_RECALLJOB_VID CONSTANT VARCHAR2(2048) := 'Canceling RecallJobs for given VID';
RECALL_FAILING CONSTANT VARCHAR2(2048) := 'Failing Recall(s)';
RECALL_FS_NOT_FOUND CONSTANT VARCHAR2(2048) := 'bestFileSystemForRecall could not find a suitable destination for this recall';
RECALL_LOOPING_ON_LOCK CONSTANT VARCHAR2(2048) := 'Giving up with recall as we are looping on locked file(s)';
RECALL_NOT_FOUND CONSTANT VARCHAR2(2048) := 'Unable to identify recall, giving up';
RECALL_INVALID_PATH CONSTANT VARCHAR2(2048) := 'setFileRecalled: unable to parse input path, giving up';
RECALL_COMPLETED_DB CONSTANT VARCHAR2(2048) := 'setFileRecalled: db updates after full recall completed';
RECALL_FILE_OVERWRITTEN CONSTANT VARCHAR2(2048) := 'setFileRecalled: file was overwritten during recall, restarting from scratch or skipping repack';
RECALL_FILE_DROPPED CONSTANT VARCHAR2(2048) := 'checkRecallInNS: file was dropped from namespace during recall, giving up';
RECALL_BAD_CHECKSUM CONSTANT VARCHAR2(2048) := 'checkRecallInNS: bad checksum detected, will retry if allowed';
RECALL_CREATED_CHECKSUM CONSTANT VARCHAR2(2048) := 'checkRecallInNS: created missing checksum in the namespace';
RECALL_FAILED CONSTANT VARCHAR2(2048) := 'setBulkFileRecallResult: recall process failed, will retry if allowed';
RECALL_PERMANENTLY_FAILED CONSTANT VARCHAR2(2048) := 'setFileRecalled: recall process failed permanently';
BULK_RECALL_COMPLETED CONSTANT VARCHAR2(2048) := 'setBulkFileRecallResult: bulk recall completed';
MIGRATION_CANCEL_BY_VID CONSTANT VARCHAR2(2048) := 'Canceling tape migration for given VID';
MIGRATION_COMPLETED CONSTANT VARCHAR2(2048) := 'setFileMigrated: db updates after full migration completed';
MIGRATION_NOT_FOUND CONSTANT VARCHAR2(2048) := 'Unable to identify migration, giving up';
MIGRATION_RETRY CONSTANT VARCHAR2(2048) := 'setBulkFilesMigrationResult: migration failed, will retry if allowed';
MIGRATION_FILE_DROPPED CONSTANT VARCHAR2(2048) := 'failFileMigration: file was dropped or modified during migration, giving up';
MIGRATION_SUPERFLUOUS_COPY CONSTANT VARCHAR2(2048) := 'failFileMigration: file already had enough copies on tape, ignoring new segment';
MIGRATION_FAILED CONSTANT VARCHAR2(2048) := 'failFileMigration: migration to tape failed for this file, giving up';
MIGRATION_FAILED_NOT_FOUND CONSTANT VARCHAR2(2048) := 'failFileMigration: file not found when failing migration';
BULK_MIGRATION_COMPLETED CONSTANT VARCHAR2(2048) := 'setBulkFilesMigrationResult: bulk migration completed';
REPACK_SUBMITTED CONSTANT VARCHAR2(2048) := 'New Repack request submitted';
REPACK_ABORTING CONSTANT VARCHAR2(2048) := 'Aborting Repack request';
REPACK_ABORTED CONSTANT VARCHAR2(2048) := 'Repack request aborted';
REPACK_ABORTED_FAILED CONSTANT VARCHAR2(2048) := 'Aborting Repack request failed, dropping it';
REPACK_JOB_ONGOING CONSTANT VARCHAR2(2048) := 'repackManager: Repack processes still starting, no new ones will be started for this round';
REPACK_STARTED CONSTANT VARCHAR2(2048) := 'repackManager: Repack process started';
REPACK_JOB_STATS CONSTANT VARCHAR2(2048) := 'repackManager: Repack processes statistics';
REPACK_UNEXPECTED_EXCEPTION CONSTANT VARCHAR2(2048) := 'handleRepackRequest: unexpected exception caught';
REPACK_COMPLETED CONSTANT VARCHAR2(2048) := 'Repack completed successfully';
REPACK_FAILED CONSTANT VARCHAR2(2048) := 'Repack ended with failures';
DRAINING_REFILL CONSTANT VARCHAR2(2048) := 'drainRunner: Creating new replication jobs';
DELETEDISKCOPY_RECALL CONSTANT VARCHAR2(2048) := 'deleteDiskCopy: diskCopy was lost, about to recall from tape';
DELETEDISKCOPY_REPLICATION CONSTANT VARCHAR2(2048) := 'deleteDiskCopy: diskCopy was lost, about to replicate from another pool';
DELETEDISKCOPY_LOST CONSTANT VARCHAR2(2048) := 'deleteDiskCopy: file was LOST and is being dropped from the system';
DELETEDISKCOPY_GC CONSTANT VARCHAR2(2048) := 'deleteDiskCopy: diskCopy is being garbage collected';
DELETEDISKCOPY_NOOP CONSTANT VARCHAR2(2048) := 'deleteDiskCopy: diskCopy could not be garbage collected';
STAGER_GET CONSTANT VARCHAR2(2048) := 'Get Request';
STAGER_PUT CONSTANT VARCHAR2(2048) := 'Put Request';
STAGER_UPDATE CONSTANT VARCHAR2(2048) := 'Update Request';
STAGER_PREPARETOGET CONSTANT VARCHAR2(2048) := 'PrepareToGet Request';
STAGER_PREPARETOPUT CONSTANT VARCHAR2(2048) := 'PrepareToPut Request';
STAGER_PREPARETOUPDATE CONSTANT VARCHAR2(2048) := 'PrepareToUpdate Request';
STAGER_D2D_TRIGGERED CONSTANT VARCHAR2(2048) := 'Triggering DiskCopy replication';
STAGER_WAITSUBREQ CONSTANT VARCHAR2(2048) := 'Request moved to Wait';
STAGER_UNABLETOPERFORM CONSTANT VARCHAR2(2048) := 'Unable to perform request, notifying user';
STAGER_RECREATION_IMPOSSIBLE CONSTANT VARCHAR2(2048) := 'Impossible to recreate CastorFile';
STAGER_CASTORFILE_RECREATION CONSTANT VARCHAR2(2048) := 'Recreating CastorFile';
STAGER_GET_REPLICATION CONSTANT VARCHAR2(2048) := 'Triggering internal DiskCopy replication';
STAGER_GET_REPLICATION_FAIL CONSTANT VARCHAR2(2048) := 'Triggering internal DiskCopy replication failed';
STAGER_DISKCOPY_FOUND CONSTANT VARCHAR2(2048) := 'Available DiskCopy found';
REPORT_HEART_BEAT_RESUMED CONSTANT VARCHAR2(2048) := 'Heartbeat resumed for diskserver, status changed to PRODUCTION';
D2D_CREATING_JOB CONSTANT VARCHAR2(2048) := 'Created new Disk2DiskCopyJob';
D2D_CANCELED_AT_START CONSTANT VARCHAR2(2048) := 'disk2DiskCopyStart : Replication request canceled while queuing in scheduler or transfer already started';
D2D_MULTIPLE_COPIES_ON_DS CONSTANT VARCHAR2(2048) := 'disk2DiskCopyStart : Multiple copies of this file already found on this diskserver';
D2D_SOURCE_GONE CONSTANT VARCHAR2(2048) := 'disk2DiskCopyStart : Source has disappeared while queuing in scheduler, retrying';
D2D_SRC_DISABLED CONSTANT VARCHAR2(2048) := 'disk2DiskCopyStart : Source diskserver/filesystem was DISABLED meanwhile';
D2D_DEST_NOT_PRODUCTION CONSTANT VARCHAR2(2048) := 'disk2DiskCopyStart : Destination diskserver/filesystem not in PRODUCTION any longer';
D2D_START_OK CONSTANT VARCHAR2(2048) := 'disk2DiskCopyStart called and returned successfully';
D2D_D2DDONE_CANCEL CONSTANT VARCHAR2(2048) := 'disk2DiskCopyEnded : Invalidating new copy as job was canceled';
D2D_D2DDONE_BADSIZE CONSTANT VARCHAR2(2048) := 'disk2DiskCopyEnded : File replication size mismatch';
D2D_D2DDONE_OK CONSTANT VARCHAR2(2048) := 'disk2DiskCopyEnded : Replication successful';
D2D_D2DDONE_RETRIED CONSTANT VARCHAR2(2048) := 'disk2DiskCopyEnded : Retrying disk to disk copy';
D2D_D2DDONE_NORETRY CONSTANT VARCHAR2(2048) := 'disk2DiskCopyEnded : no retry, giving up';
D2D_D2DFAILED CONSTANT VARCHAR2(2048) := 'disk2DiskCopyEnded : replication failed';
REBALANCING_START CONSTANT VARCHAR2(2048) := 'rebalancing : starting';
REBALANCING_STOP CONSTANT VARCHAR2(2048) := 'rebalancing : stopping';
END dlf;
/
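/* Illustrative only, not executed by this upgrade: the constants above are consumed by
 * the logToDLF API used throughout this script (see e.g. cancelRecallForCFAndVID below).
 * A minimal sketch of such a call, with made-up fileid and nsHost values:
 *
 *   BEGIN
 *     logToDLF(NULL,                         -- no request uuid available here
 *              dlf.LVL_SYSTEM,               -- message level
 *              dlf.RECALL_CANCEL_BY_VID,     -- message text
 *              123456789, 'cns.example.org', -- fileid and namespace host
 *              'tapegatewayd',               -- originating component
 *              'TPVID=T00042');              -- free-form key=value parameters
 *   END;
 */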
/* fail recall of a given CastorFile for a non-existing tape */
CREATE OR REPLACE PROCEDURE cancelRecallForCFAndVID(inCfId IN INTEGER,
inVID IN VARCHAR2,
inErrorCode IN INTEGER,
inErrorMsg IN VARCHAR2) AS
PRAGMA AUTONOMOUS_TRANSACTION;
varNbRecalls INTEGER;
varFileId INTEGER;
varNsHost VARCHAR2(2048);
BEGIN
-- lock castorFile, skip if it's missing
-- (it may have disappeared in the meantime as we held no lock)
BEGIN
SELECT fileid, nsHost INTO varFileId, varNsHost
FROM CastorFile
WHERE id = inCfId
FOR UPDATE;
EXCEPTION
WHEN NO_DATA_FOUND THEN RETURN;
END;
-- log "Canceling RecallJobs for given VID"
logToDLF(NULL, dlf.LVL_SYSTEM, dlf.RECALL_CANCEL_RECALLJOB_VID, varFileId, varNsHost, 'tapegatewayd',
'errorCode=' || TO_CHAR(inErrorCode) ||
' errorMessage="' || inErrorMsg ||
'" TPVID=' || inVID);
-- remove recallJobs that need the non-existing tape
DELETE FROM RecallJob WHERE castorfile = inCfId AND VID=inVID;
-- check if other recallJobs remain (typically dual copy tapes)
SELECT /*+ INDEX_RS_ASC(RecallJob I_RecallJob_CastorFile_VID) */
count(*) INTO varNbRecalls
FROM RecallJob WHERE castorfile = inCfId;
-- if no remaining recalls, fail requests and cleanup
IF varNbRecalls = 0 THEN
-- log "Failing Recall(s)"
logToDLF(NULL, dlf.LVL_ERROR, dlf.RECALL_FAILING, varFileId, varNsHost, 'tapegatewayd',
'errorCode=' || TO_CHAR(inErrorCode) ||
' errorMessage="' || inErrorMsg ||
'" TPVID=' || inVID);
-- delete potential migration jobs waiting on recalls
deleteMigrationJobsForRecall(inCfId);
-- Fail the associated subrequest(s)
UPDATE /*+ INDEX_RS_ASC(SR I_Subrequest_Castorfile)*/ SubRequest SR
SET SR.status = dconst.SUBREQUEST_FAILED,
SR.getNextStatus = dconst.GETNEXTSTATUS_FILESTAGED, -- (not strictly correct but the request is over anyway)
SR.lastModificationTime = getTime(),
SR.errorCode = serrno.SEINTERNAL,
SR.errorMessage = 'File recall from tape has failed (tape not available), please try again later'
WHERE SR.castorFile = inCfId
AND SR.status IN (dconst.SUBREQUEST_WAITTAPERECALL, dconst.SUBREQUEST_WAITSUBREQ);
END IF;
COMMIT;
END;
/
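/* Illustrative only, not executed by this upgrade: a sketch of invoking the procedure
 * above for a hypothetical CastorFile id and tape VID. Note that it commits on its own
 * (autonomous transaction), so the caller does not have to.
 *
 *   BEGIN
 *     cancelRecallForCFAndVID(12345,              -- hypothetical CastorFile id
 *                             'T00042',            -- hypothetical tape VID
 *                             serrno.SEINTERNAL,   -- error code reported to the user
 *                             'Tape no longer available');  -- made-up error message
 *   END;
 */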
/* Search and delete old diskCopies in bad states */
CREATE OR REPLACE PROCEDURE deleteFailedDiskCopies(timeOut IN NUMBER) AS
dcIds "numList";
cfIds "numList";
BEGIN
LOOP
-- select INVALID diskcopies without filesystem (they can exist after a
-- stageRm that came before the diskcopy had been created on disk) and ALL FAILED
-- ones (coming from failed recalls or failed removals from the GC daemon).
-- Note that we don't select INVALID diskcopies from recreation of files
-- because they are taken by the standard GC as they physically exist on disk.
-- go only for 1000 at a time and retry if the limit was reached
SELECT id
BULK COLLECT INTO dcIds
FROM DiskCopy
WHERE (status = 4 OR (status = 7 AND fileSystem = 0))
AND creationTime < getTime() - timeOut
AND ROWNUM <= 1000;
SELECT /*+ INDEX(DC PK_DiskCopy_ID) */ UNIQUE castorFile
BULK COLLECT INTO cfIds
FROM DiskCopy DC
WHERE id IN (SELECT /*+ CARDINALITY(ids 5) */ * FROM TABLE(dcIds) ids);
-- drop the DiskCopies - not in bulk because of the constraint violation check
FOR i IN 1 .. dcIds.COUNT LOOP
DECLARE
CONSTRAINT_VIOLATED EXCEPTION;
PRAGMA EXCEPTION_INIT(CONSTRAINT_VIOLATED, -1);
BEGIN
DELETE FROM DiskCopy WHERE id = dcIds(i);
EXCEPTION WHEN CONSTRAINT_VIOLATED THEN
IF sqlerrm LIKE '%constraint (CASTOR_STAGER.FK_DRAININGERRORS_CASTORFILE) violated%' OR
sqlerrm LIKE '%constraint (CASTOR_STAGER.FK_DISK2DISKCOPYJOB_SRCDCID) violated%' THEN
-- Ignore the deletion: this diskcopy was either involved in a draining action and
-- the draining error is still around, or it is the source of another d2d copy that
-- is not yet over
NULL;
ELSE
-- Any other constraint violation is an error
RAISE;
END IF;
END;
END LOOP;
COMMIT;
-- maybe delete the CastorFiles if nothing is left for them
FOR i IN 1 .. cfIds.COUNT LOOP
deleteCastorFile(cfIds(i));
END LOOP;
COMMIT;
-- exit if we did less than 1000
IF dcIds.COUNT < 1000 THEN EXIT; END IF;
END LOOP;
END;
/
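/* Illustrative only, not part of this upgrade: deleteFailedDiskCopies takes its timeout
 * in seconds (it is compared to DiskCopy.creationTime via getTime()). A sketch dropping
 * failed or orphaned-INVALID diskcopies older than 3 days (the 3-day value is an
 * assumption, not a CASTOR default):
 *
 *   BEGIN
 *     deleteFailedDiskCopies(3 * 86400);
 *   END;
 */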
/* Search and delete old archived/failed subrequests and their requests */
CREATE OR REPLACE PROCEDURE deleteTerminatedRequests AS
failuresTimeOut INTEGER;
successesTimeOut INTEGER;
rate INTEGER;
srIds "numList";
ct NUMBER;
BEGIN
-- select requested timeout for failed requests from configuration table
failuresTimeOut := 3600*TO_NUMBER(getConfigOption('cleaning', 'failedRequestsTimeout', '168')); -- 1 week
-- compute a rate-dependent timeout for the successful requests by looking at the
-- last half-hour of activity: keep max 1M of them.
SELECT 1800 * 1000000 / (count(*)+1) INTO successesTimeOut
FROM SubRequest
WHERE status = dconst.SUBREQUEST_ARCHIVED
AND lastModificationTime > getTime() - 1800;
IF successesTimeOut > failuresTimeOut THEN
-- in case of light load, don't keep successful requests for longer than failed ones
successesTimeOut := failuresTimeOut;
END IF;
-- Delete castorFiles if nothing is left for them. Here we use
-- a temporary table as we need to commit every ~1000 operations
-- and keeping a cursor opened on the original select may take
-- too long, leading to ORA-01555 'snapshot too old' errors.
EXECUTE IMMEDIATE 'TRUNCATE TABLE DeleteTermReqHelper';
INSERT /*+ APPEND */ INTO DeleteTermReqHelper (srId, cfId)
(SELECT SR.id, castorFile FROM SubRequest SR
WHERE (SR.status = dconst.SUBREQUEST_ARCHIVED
AND SR.lastModificationTime < getTime() - successesTimeOut)
-- failed subrequests are kept according to the configured timeout
OR (SR.status = dconst.SUBREQUEST_FAILED_FINISHED
AND reqType != 119 AND SR.lastModificationTime < getTime() - failuresTimeOut)); -- StageRepackRequest
COMMIT; -- needed otherwise the next statement raises
-- ORA-12838: cannot read/modify an object after modifying it in parallel
-- 2nd part, separated from above for efficiency reasons
INSERT /*+ APPEND */ INTO DeleteTermReqHelper (srId, cfId)
(SELECT SR.id, castorFile FROM SubRequest SR, StageRepackRequest R
WHERE SR.status = dconst.SUBREQUEST_FAILED_FINISHED
-- only for the Repack case, we keep all failed subrequests around until
-- the whole Repack request is over for more than <timeOut> seconds
AND reqType = 119 AND R.lastModificationTime < getTime() - failuresTimeOut -- StageRepackRequest
AND R.id = SR.request);
COMMIT;
SELECT count(*) INTO ct FROM DeleteTermReqHelper;
logToDLF(NULL, dlf.LVL_SYSTEM, dlf.DELETING_REQUESTS, 0, '', 'stagerd',
'SubRequestsCount=' || ct);
ct := 0;
FOR cf IN (SELECT UNIQUE cfId FROM DeleteTermReqHelper) LOOP
deleteCastorFile(cf.cfId);
ct := ct + 1;
IF ct = 1000 THEN
COMMIT;
ct := 0;
END IF;
END LOOP;
-- Now delete all old subRequests. We reuse here the
-- temporary table, which serves as a snapshot of the
-- entries to be deleted, and we use the FORALL logic
-- (cf. bulkDelete) instead of a simple DELETE ...
-- WHERE id IN (SELECT srId FROM DeleteTermReqHelper)
-- for efficiency reasons. Moreover, we don't risk
-- here the ORA-01555 error keeping the cursor open
-- between commits as we are selecting on our
-- temporary table.
DECLARE
CURSOR s IS
SELECT srId FROM DeleteTermReqHelper;
ids "numList";
BEGIN
OPEN s;
LOOP
FETCH s BULK COLLECT INTO ids LIMIT 10000;
EXIT WHEN ids.count = 0;
FORALL i IN 1 .. ids.COUNT
DELETE FROM SubRequest WHERE id = ids(i);
COMMIT;
END LOOP;
CLOSE s;
END;
EXECUTE IMMEDIATE 'TRUNCATE TABLE DeleteTermReqHelper';
-- And then related Requests, now orphaned.
---- Get ----
bulkDeleteRequests('StageGetRequest');
---- Put ----
bulkDeleteRequests('StagePutRequest');
---- Update ----
bulkDeleteRequests('StageUpdateRequest');
---- PrepareToGet -----
bulkDeleteRequests('StagePrepareToGetRequest');
---- PrepareToPut ----
bulkDeleteRequests('StagePrepareToPutRequest');
---- PrepareToUpdate ----
bulkDeleteRequests('StagePrepareToUpdateRequest');
---- PutDone ----
bulkDeleteRequests('StagePutDoneRequest');
---- Rm ----
bulkDeleteRequests('StageRmRequest');
---- SetGCWeight ----
bulkDeleteRequests('SetFileGCWeight');
-- Finally deal with Repack: this case is different because StageRepackRequests may be empty
-- at the beginning. Therefore we only drop repacks that are in a completed state
-- for more than the requested time.
-- First failed ones (status FAILED, ABORTED)
bulkDelete('SELECT id FROM StageRepackRequest R WHERE status IN (3, 5)
AND NOT EXISTS (SELECT 1 FROM SubRequest WHERE request = R.id)
AND lastModificationTime < getTime() - ' || failuresTimeOut || ';',
'StageRepackRequest');
-- Then successful ones (status FINISHED)
bulkDelete('SELECT id FROM StageRepackRequest R WHERE status = 2
AND NOT EXISTS (SELECT 1 FROM SubRequest WHERE request = R.id)
AND lastModificationTime < getTime() - ' || successesTimeOut || ';',
'StageRepackRequest');
END;
/
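/* Illustrative only, not part of this upgrade. Two notes on the procedure above:
 * - the rate-dependent timeout keeps roughly 1M successful requests: e.g. with 100000
 *   ARCHIVED subrequests in the last half hour, successesTimeOut = 1800*1000000/100001,
 *   i.e. about 18000 seconds, which at that rate corresponds to ~1M kept requests;
 * - it is meant to be driven by a periodic database job. A minimal DBMS_SCHEDULER
 *   sketch (job name and frequency are assumptions, not the values used by CASTOR):
 *
 *   BEGIN
 *     DBMS_SCHEDULER.CREATE_JOB(
 *       job_name        => 'EXAMPLE_DELETE_TERMINATED_REQUESTS',
 *       job_type        => 'PLSQL_BLOCK',
 *       job_action      => 'BEGIN deleteTerminatedRequests(); END;',
 *       repeat_interval => 'FREQ=HOURLY; INTERVAL=1',
 *       enabled         => TRUE);
 *   END;
 */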
/* Procedure responsible for managing the draining process
*/
CREATE OR REPLACE PROCEDURE drainManager AS
varTFiles INTEGER;
varTBytes INTEGER;
BEGIN
-- Delete the FINISHED jobs older than 7 days
DELETE FROM DrainingJob
WHERE status = dconst.DRAININGJOB_FINISHED
AND lastModificationTime < getTime() - (7 * 86400);
COMMIT;
-- Start new DrainingJobs if needed
FOR dj IN (SELECT id, fileSystem, fileMask
FROM DrainingJob WHERE status = dconst.DRAININGJOB_SUBMITTED) LOOP
UPDATE DrainingJob SET status = dconst.DRAININGJOB_STARTING WHERE id = dj.id;
COMMIT;
-- Compute totals now. Jobs will be later added in bunches by drainRunner
SELECT count(*), SUM(diskCopySize) INTO varTFiles, varTBytes
FROM DiskCopy, CastorFile
WHERE fileSystem = dj.fileSystem
AND status = dconst.DISKCOPY_VALID
AND CastorFile.id = DiskCopy.castorFile
AND ((dj.fileMask = dconst.DRAIN_FILEMASK_NOTONTAPE AND
CastorFile.tapeStatus IN (dconst.CASTORFILE_NOTONTAPE, dconst.CASTORFILE_DISKONLY)) OR
(dj.fileMask = dconst.DRAIN_FILEMASK_ALL));
UPDATE DrainingJob
SET totalFiles = varTFiles,
totalBytes = nvl(varTBytes, 0),
status = decode(varTBytes, NULL, dconst.DRAININGJOB_FINISHED, dconst.DRAININGJOB_RUNNING)
WHERE id = dj.id;
COMMIT;
END LOOP;
END;
/
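/* Illustrative only: drainManager moves DrainingJob rows from SUBMITTED to STARTING and
 * then to RUNNING (or straight to FINISHED when there is nothing to drain), filling in
 * the totalFiles/totalBytes counters. A sketch for following the jobs it manages, using
 * only columns referenced in this script:
 *
 *   BEGIN
 *     FOR dj IN (SELECT id, fileSystem, status, totalFiles, totalBytes
 *                  FROM DrainingJob) LOOP
 *       dbms_output.put_line('DrainingJob ' || dj.id || ' status=' || dj.status ||
 *                            ' files=' || dj.totalFiles || ' bytes=' || dj.totalBytes);
 *     END LOOP;
 *   END;
 */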
/* Fail a file migration, potentially archiving outstanding repack requests */
CREATE OR REPLACE PROCEDURE failFileMigration(inMountTrId IN NUMBER, inFileId IN NUMBER,
inErrorCode IN INTEGER, inReqId IN VARCHAR2) AS
varNsHost VARCHAR2(2048);
varCfId NUMBER;
varNsOpenTime NUMBER;
varSrIds "numList";
varOriginalCopyNb NUMBER;
varMigJobCount NUMBER;
varErrorCode INTEGER := inErrorCode;
BEGIN
varNsHost := getConfigOption('stager', 'nsHost', '');
-- Lock castor file
SELECT id, nsOpenTime INTO varCfId, varNsOpenTime
FROM CastorFile WHERE fileId = inFileId FOR UPDATE;
-- delete migration job
DELETE FROM MigrationJob
WHERE castorFile = varCFId AND mountTransactionId = inMountTrId
RETURNING originalCopyNb INTO varOriginalCopyNb;
-- check if another migration should be performed
SELECT /*+ INDEX_RS_ASC(MigrationJob I_MigrationJob_CFVID) */
count(*) INTO varMigJobCount
FROM MigrationJob
WHERE castorfile = varCfId;
IF varMigJobCount = 0 THEN
-- no other migration, delete all migrated segments
DELETE FROM MigratedSegment
WHERE castorfile = varCfId;
END IF;
-- terminate repack subrequests
IF varOriginalCopyNb IS NOT NULL THEN
archiveOrFailRepackSubreq(varCfId, inErrorCode);
END IF;
IF varErrorCode = serrno.ENOENT THEN
-- unfortunately, tape servers can throw this error too (see SR #136759), so we have to double check
-- prior to taking destructive actions on the file: if the file does exist in the Nameserver, then
-- replace the error code with a generic ETSYS (taped system error), otherwise keep ENOENT
BEGIN
SELECT 1902 INTO varErrorCode FROM Dual
WHERE EXISTS (SELECT 1 FROM Cns_file_metadata@RemoteNS WHERE fileid = inFileId);
EXCEPTION WHEN NO_DATA_FOUND THEN
NULL;
END;
END IF;
-- Log depending on the error: some are not pathological and have dedicated handling
IF varErrorCode = serrno.ENOENT OR varErrorCode = serrno.ENSFILECHG OR varErrorCode = serrno.ENSNOSEG THEN
-- in this case, disk cache is stale
UPDATE DiskCopy SET status = dconst.DISKCOPY_INVALID
WHERE status = dconst.DISKCOPY_VALID
AND castorFile = varCfId;
-- cleanup other migration jobs for that file if any
DELETE FROM MigrationJob WHERE castorfile = varCfId;
-- Log 'file was dropped or modified during migration, giving up'
logToDLF(inReqid, dlf.LVL_NOTICE, dlf.MIGRATION_FILE_DROPPED, inFileId, varNsHost, 'tapegatewayd',
'mountTransactionId=' || inMountTrId || ' ErrorCode=' || varErrorCode ||
' NsOpenTimeAtStager=' || trunc(varNsOpenTime, 6));
ELSIF varErrorCode = serrno.ENSTOOMANYSEGS THEN
-- do as if migration was successful
UPDATE CastorFile SET tapeStatus = dconst.CASTORFILE_ONTAPE WHERE id = varCfId;
-- Log 'file already had enough copies on tape, ignoring new segment'
logToDLF(inReqid, dlf.LVL_NOTICE, dlf.MIGRATION_SUPERFLUOUS_COPY, inFileId, varNsHost, 'tapegatewayd',
'mountTransactionId=' || inMountTrId);
ELSE
-- Any other case, log 'migration to tape failed for this file, giving up'
logToDLF(inReqid, dlf.LVL_ERROR, dlf.MIGRATION_FAILED, inFileId, varNsHost, 'tapegatewayd',
'mountTransactionId=' || inMountTrId || ' LastErrorCode=' || varErrorCode);
END IF;
EXCEPTION WHEN NO_DATA_FOUND THEN
-- File was dropped, log 'file not found when failing migration'
logToDLF(inReqid, dlf.LVL_ERROR, dlf.MIGRATION_FAILED_NOT_FOUND, inFileId, varNsHost, 'tapegatewayd',
'mountTransactionId=' || inMountTrId || ' LastErrorCode=' || varErrorCode);
END;
/
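/* Illustrative only, not executed by this upgrade: a sketch of how a failed migration
 * would be reported to the procedure above. All identifier values are made up; passing
 * serrno.ENOENT exercises the namespace double check described in the code.
 *
 *   BEGIN
 *     failFileMigration(987654,         -- hypothetical mountTransactionId
 *                       123456789,      -- hypothetical namespace fileid
 *                       serrno.ENOENT,  -- error code as reported by the tape server
 *                       'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee');  -- hypothetical request uuid
 *     COMMIT;  -- the procedure itself does not commit
 *   END;
 */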
/* PL/SQL method implementing replicateOnClose */
CREATE OR REPLACE PROCEDURE replicateOnClose(cfId IN NUMBER, ouid IN INTEGER, ogid IN INTEGER) AS
varNsOpenTime NUMBER;
srcSvcClassId NUMBER;
ignoreSvcClass NUMBER;
BEGIN
-- Lock the castorfile and take the nsOpenTime
SELECT nsOpenTime INTO varNsOpenTime FROM CastorFile WHERE id = cfId FOR UPDATE;
-- Loop over all service classes where replication is required
FOR a IN (SELECT SvcClass.id FROM (
-- Determine the number of copies of the file in all service classes
SELECT * FROM (
SELECT /*+ INDEX_RS_ASC(DiskCopy I_DiskCopy_CastorFile) */
SvcClass.id, count(*) available
FROM DiskCopy, FileSystem, DiskServer, DiskPool2SvcClass, SvcClass
WHERE DiskCopy.filesystem = FileSystem.id
AND DiskCopy.castorfile = cfId
AND FileSystem.diskpool = DiskPool2SvcClass.parent
AND DiskPool2SvcClass.child = SvcClass.id
AND DiskCopy.status = dconst.DISKCOPY_VALID
AND FileSystem.status IN
(dconst.FILESYSTEM_PRODUCTION, dconst.FILESYSTEM_DRAINING, dconst.FILESYSTEM_READONLY)
AND DiskServer.id = FileSystem.diskserver
AND DiskServer.status IN
(dconst.DISKSERVER_PRODUCTION, dconst.DISKSERVER_DRAINING, dconst.DISKSERVER_READONLY)
GROUP BY SvcClass.id)
) results, SvcClass
-- Join the results with the service class table and determine if
-- additional copies need to be created
WHERE results.id = SvcClass.id
AND SvcClass.replicateOnClose = 1
AND results.available < SvcClass.maxReplicaNb)
LOOP
BEGIN
-- Trigger a replication request.
createDisk2DiskCopyJob(cfId, varNsOpenTime, a.id, ouid, ogid, dconst.REPLICATIONTYPE_USER, NULL, FALSE, NULL, TRUE);
EXCEPTION WHEN NO_DATA_FOUND THEN
NULL; -- No copies to replicate from
END;
END LOOP;
END;
/
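/* Illustrative only, not executed by this upgrade: a sketch of triggering the procedure
 * above for a hypothetical CastorFile, with the owner uid/gid as they would come from
 * the request that closed the file:
 *
 *   BEGIN
 *     replicateOnClose(12345,  -- hypothetical CastorFile id
 *                      1001,   -- file owner uid
 *                      2766);  -- file owner gid
 *     COMMIT;
 *   END;
 */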
/* DB job to start new recall mounts */
CREATE OR REPLACE PROCEDURE startRecallMounts AS
varNbMounts INTEGER;
varNbExtraMounts INTEGER := 0;
BEGIN
-- loop through RecallGroups
FOR rg IN (SELECT id, name, nbDrives, minAmountDataForMount,
minNbFilesForMount, maxFileAgeBeforeMount
FROM RecallGroup
ORDER BY vdqmPriority DESC) LOOP
-- get number of mounts already running for this recallGroup
SELECT COUNT(*) INTO varNbMounts
FROM RecallMount
WHERE recallGroup = rg.id;
-- check whether some tapes should be mounted
IF varNbMounts < rg.nbDrives THEN
DECLARE
varVID VARCHAR2(2048);
varDataAmount INTEGER;
varNbFiles INTEGER;
varOldestCreationTime NUMBER;
BEGIN
-- loop over the best candidates until we have enough mounts
WHILE varNbMounts + varNbExtraMounts < rg.nbDrives LOOP
SELECT * INTO varVID, varDataAmount, varNbFiles, varOldestCreationTime FROM (
SELECT vid, SUM(fileSize) dataAmount, COUNT(*) nbFiles, MIN(creationTime)
FROM RecallJob
WHERE recallGroup = rg.id
AND status = tconst.RECALLJOB_PENDING
GROUP BY vid
HAVING (SUM(fileSize) >= rg.minAmountDataForMount OR
COUNT(*) >= rg.minNbFilesForMount OR
gettime() - MIN(creationTime) > rg.maxFileAgeBeforeMount)
AND VID NOT IN (SELECT vid FROM RecallMount)
ORDER BY MIN(creationTime))
WHERE ROWNUM < 2;
-- trigger a new mount
INSERT INTO RecallMount (id, VID, recallGroup, startTime, status)
VALUES (ids_seq.nextval, varVid, rg.id, gettime(), tconst.RECALLMOUNT_NEW);
varNbExtraMounts := varNbExtraMounts + 1;
-- log "startRecallMounts: created new recall mount"
logToDLF(NULL, dlf.LVL_SYSTEM, dlf.RECMOUNT_NEW_MOUNT, 0, '', 'tapegatewayd',
'recallGroup=' || rg.name ||
' TPVID=' || varVid ||
' nbExistingMounts=' || TO_CHAR(varNbMounts) ||
' nbNewMountsSoFar=' || TO_CHAR(varNbExtraMounts) ||
' dataAmountInQueue=' || TO_CHAR(varDataAmount) ||
' nbFilesInQueue=' || TO_CHAR(varNbFiles) ||
' oldestCreationTime=' || TO_CHAR(TRUNC(varOldestCreationTime)));
END LOOP;
EXCEPTION WHEN NO_DATA_FOUND THEN
-- nothing left to recall, just exit nicely
NULL;
END;
IF varNbExtraMounts = 0 THEN
-- log "startRecallMounts: no candidate found for a mount"
logToDLF(NULL, dlf.LVL_DEBUG, dlf.RECMOUNT_NOACTION_NOCAND, 0, '',
'tapegatewayd', 'recallGroup=' || rg.name);