Commit b82f8897 authored by Giuseppe Lo Presti's avatar Giuseppe Lo Presti Committed by Sebastien Ponce
Browse files

Fixed bug #103387: Incorrect clean up of Disk2diskCopyJobs when they are cancelled
parent e9d1314a
......@@ -5,9 +5,10 @@
Summary
-------
This is a hot fix release that fixes the following two bugs:
This is a hot fix release that fixes the following bugs:
- #103363: Creation of tape mounts for migration is not resilient to hardware unavailability
- #103370: The logic to resume recall jobs after an unmount is broken when dealing with double copy recalls
- #103387: Incorrect clean up of Disk2diskCopyJobs when they are cancelled
Upgrade Instructions from 2.1.14-5
----------------------------------
......
......@@ -380,7 +380,7 @@ BEGIN
replacedDcId, retryCounter, drainingJob
INTO varCfId, varUid, varGid, varDestDcId, varDestSvcClass, varRepType,
varReplacedDcId, varRetryCounter, varDrainingJob
FROM Disk2DiskCopyjob
FROM Disk2DiskCopyJob
WHERE transferId = inTransferId;
-- lock the castor file (and get logging info)
SELECT fileid, nsHost, fileSize INTO varFileId, varNsHost, varFileSize
......@@ -395,6 +395,7 @@ BEGIN
-- So our brand new copy has to be created as invalid to trigger GC.
varNewDcStatus := dconst.DISKCOPY_INVALID;
varLogMsg := dlf.D2D_D2DDONE_CANCEL;
varDestDcId := ids_seq.nextval;
END;
-- check the filesize
IF inReplicaFileSize != varFileSize THEN
......@@ -414,7 +415,7 @@ BEGIN
varComment := varComment || ' replicaFileSize=' || TO_CHAR(inReplicaFileSize) ||
' errorMessage=' || inErrorMessage;
END IF;
logToDLF(NULL, dlf.LVL_SYSTEM, varLogMsg, varFileId, varNsHost, 'stagerd', varComment);
logToDLF(NULL, dlf.LVL_SYSTEM, varLogMsg, varFileId, varNsHost, 'transfermanagerd', varComment);
-- if success, create new DiskCopy, restart waiting requests, cleanup and handle replicate on close
IF inErrorMessage IS NULL THEN
-- get filesystem of the diskcopy and parse diskcopy path
......
......@@ -486,6 +486,183 @@ BEGIN
END;
/
-- #103387: Incorrect clean up of Disk2diskCopyJobs when they are cancelled
/* PL/SQL method implementing disk2DiskCopyEnded
 *
 * Called when a disk to disk copy has finished, successfully or not.
 *
 * Parameters:
 *   inTransferId      identifies the Disk2DiskCopyJob row this call is about
 *   inDestDsName      name of the destination disk server (matched against DiskServer.name)
 *   inDestPath        full destination path; it must start with the mountPoint of one of
 *                     the FileSystems of inDestDsName
 *   inReplicaFileSize size of the replica, checked against CastorFile.fileSize
 *   inErrorCode       error code of the transfer; no retry is attempted when it equals
 *                     serrno.ESTKILLED
 *   inErrorMessage    NULL on success, the error message otherwise
 *
 * Note that when inErrorMessage is not NULL, inDestDsName, inDestPath and
 * inReplicaFileSize only end up in the log message.
 *
 * On success a new DiskCopy is inserted (VALID, or INVALID when the job was cancelled or
 * the replica size does not match), waiting subrequests are restarted and the
 * Disk2DiskCopyJob is deleted. On failure the job is either rescheduled (retry) or
 * deleted, in which case waiting subrequests are failed.
 *
 * NOTE(review): an earlier version of this header stated that inErrorCode is also used to
 * invalidate the source diskCopy on ENOENT; nothing in this body does so - to be confirmed.
 */
CREATE OR REPLACE PROCEDURE disk2DiskCopyEnded
(inTransferId IN VARCHAR2, inDestDsName IN VARCHAR2, inDestPath IN VARCHAR2,
 inReplicaFileSize IN INTEGER, inErrorCode IN INTEGER, inErrorMessage IN VARCHAR2) AS
  varCfId INTEGER;
  varUid INTEGER := -1;
  varGid INTEGER := -1;
  varDestDcId INTEGER;
  varDestSvcClass INTEGER;
  varRepType INTEGER;
  varReplacedDcId INTEGER;
  varRetryCounter INTEGER;
  varFileId INTEGER;
  varNsHost VARCHAR2(2048);
  varFileSize INTEGER;
  varDestPath VARCHAR2(2048);
  varDestFsId INTEGER;
  varDcGcWeight NUMBER := 0;
  varDcImportance NUMBER := 0;
  varNewDcStatus INTEGER := dconst.DISKCOPY_VALID;
  varLogMsg VARCHAR2(2048);
  varComment VARCHAR2(2048);
  varDrainingJob VARCHAR2(2048);
BEGIN
  varLogMsg := CASE WHEN inErrorMessage IS NULL THEN dlf.D2D_D2DDONE_OK ELSE dlf.D2D_D2DFAILED END;
  BEGIN
    -- Get data from the Disk2DiskCopyJob
    SELECT castorFile, ouid, ogid, destDcId, destSvcClass, replicationType,
           replacedDcId, retryCounter, drainingJob
      INTO varCfId, varUid, varGid, varDestDcId, varDestSvcClass, varRepType,
           varReplacedDcId, varRetryCounter, varDrainingJob
      FROM Disk2DiskCopyJob
     WHERE transferId = inTransferId;
    -- lock the castor file (and get logging info)
    SELECT fileid, nsHost, fileSize INTO varFileId, varNsHost, varFileSize
      FROM CastorFile
     WHERE id = varCfId
       FOR UPDATE;
  EXCEPTION WHEN NO_DATA_FOUND THEN
    -- two possibilities here :
    --   - disk2diskCopyJob not found. It was probably canceled.
    --   - the castorFile has disappeared before we locked it, and the
    --     disk2diskCopyJob too as we have a foreign key constraint.
    -- So our brand new copy has to be created as invalid to trigger GC.
    varNewDcStatus := dconst.DISKCOPY_INVALID;
    varLogMsg := dlf.D2D_D2DDONE_CANCEL;
    -- the job row is gone, so no destDcId is available: allocate a fresh id
    -- for the invalid DiskCopy inserted below
    varDestDcId := ids_seq.nextval;
  END;
  -- check the filesize
  -- (when varFileSize is NULL, i.e. in the cancel case above, this comparison
  -- evaluates to NULL and the branch is not taken)
  IF inReplicaFileSize != varFileSize THEN
    -- replication went wrong !
    -- only downgrade a so far successful outcome; failures and cancellations keep their message
    IF varLogMsg = dlf.D2D_D2DDONE_OK THEN
      varLogMsg := dlf.D2D_D2DDONE_BADSIZE;
      varNewDcStatus := dconst.DISKCOPY_INVALID;
    END IF;
  END IF;
  -- Log success or failure of the replication
  varComment := 'transferId=' || inTransferId ||
         ' destSvcClass=' || getSvcClassName(varDestSvcClass) ||
         ' dstDcId=' || TO_CHAR(varDestDcId) || ' destPath=' || inDestDsName || ':' || inDestPath ||
         ' euid=' || TO_CHAR(varUid) || ' egid=' || TO_CHAR(varGid) ||
         ' fileSize=' || TO_CHAR(varFileSize);
  IF inErrorMessage IS NOT NULL THEN
    varComment := varComment || ' replicaFileSize=' || TO_CHAR(inReplicaFileSize) ||
                  ' errorMessage=' || inErrorMessage;
  END IF;
  logToDLF(NULL, dlf.LVL_SYSTEM, varLogMsg, varFileId, varNsHost, 'transfermanagerd', varComment);
  -- if success, create new DiskCopy, restart waiting requests, cleanup and handle replicate on close
  IF inErrorMessage IS NULL THEN
    -- get filesystem of the diskcopy and parse diskcopy path:
    -- the stored path is inDestPath with the leading mountPoint stripped
    SELECT FileSystem.id, SUBSTR(inDestPath, LENGTH(FileSystem.mountPoint)+1)
      INTO varDestFsId, varDestPath
      FROM DiskServer, FileSystem
     WHERE DiskServer.name = inDestDsName
       AND FileSystem.diskServer = DiskServer.id
       AND INSTR(inDestPath, FileSystem.mountPoint) = 1;
    -- compute GcWeight and importance of the new copy
    IF varNewDcStatus = dconst.DISKCOPY_VALID THEN
      DECLARE
        varGcwProc VARCHAR2(2048);
      BEGIN
        -- castorGC.getCopyWeight returns the name of the function to call;
        -- that function is then invoked dynamically with the file size
        varGcwProc := castorGC.getCopyWeight(varFileSize);
        EXECUTE IMMEDIATE
          'BEGIN :newGcw := ' || varGcwProc || '(:size); END;'
          USING OUT varDcGcWeight, IN varFileSize;
        -- importance = number of already valid copies, plus the new one
        SELECT /*+ INDEX_RS_ASC (DiskCopy I_DiskCopy_CastorFile) */
               COUNT(*)+1 INTO varDcImportance FROM DiskCopy
         WHERE castorFile=varCfId AND status = dconst.DISKCOPY_VALID;
      END;
    END IF;
    -- create the new DiskCopy
    -- (an INVALID copy is flagged GCTYPE_OVERWRITTEN and keeps gcWeight/importance at 0)
    INSERT INTO DiskCopy (path, gcWeight, creationTime, lastAccessTime, diskCopySize, nbCopyAccesses,
                          owneruid, ownergid, id, gcType, fileSystem, castorFile, status, importance)
    VALUES (varDestPath, varDcGcWeight, getTime(), getTime(), varFileSize, 0, varUid, varGid, varDestDcId,
            CASE varNewDcStatus WHEN dconst.DISKCOPY_INVALID THEN dconst.GCTYPE_OVERWRITTEN ELSE NULL END,
            varDestFsId, varCfId, varNewDcStatus, varDcImportance);
    -- Wake up waiting subrequests
    -- (inErrorMessage is always NULL in this branch, so the CASE below always
    -- selects GETNEXTSTATUS_FILESTAGED)
    UPDATE SubRequest
       SET status = dconst.SUBREQUEST_RESTART,
           getNextStatus = CASE WHEN inErrorMessage IS NULL THEN dconst.GETNEXTSTATUS_FILESTAGED ELSE getNextStatus END,
           lastModificationTime = getTime()
     WHERE status = dconst.SUBREQUEST_WAITSUBREQ
       AND castorfile = varCfId;
    DBMS_ALERT.SIGNAL('wakeUpJobReqSvc', '');
    -- delete the disk2diskCopyJob
    DELETE FROM Disk2DiskCopyJob WHERE transferId = inTransferId;
    -- In case of valid new copy
    IF varNewDcStatus = dconst.DISKCOPY_VALID THEN
      -- update importance of other DiskCopies if it's an additional one
      -- NOTE(review): the condition fires when a copy is being replaced
      -- (varReplacedDcId set), while "additional" suggests the opposite - to be confirmed
      IF varReplacedDcId IS NOT NULL THEN
        UPDATE DiskCopy SET importance = varDcImportance WHERE castorFile=varCfId;
      END IF;
      -- drop source if requested (matches no row when varReplacedDcId is NULL)
      UPDATE DiskCopy SET status = dconst.DISKCOPY_INVALID WHERE id = varReplacedDcId;
      -- Trigger the creation of additional copies of the file, if any
      replicateOnClose(varCfId, varUid, varGid);
    END IF;
    -- In case of draining, update DrainingJob
    IF varDrainingJob IS NOT NULL THEN
      updateDrainingJobOnD2dEnd(varDrainingJob, varFileSize, False);
    END IF;
  ELSE
    DECLARE
      varMaxNbD2dRetries INTEGER := TO_NUMBER(getConfigOption('D2dCopy', 'MaxNbRetries', 2));
    BEGIN
      -- shall we try again ?
      -- we should not when the job was deliberately killed, neither when we reach the maximum
      -- number of attempts
      -- (a NULL varRetryCounter, i.e. job not found, or a NULL inErrorCode makes this
      -- condition NULL, which also leads to the no retry branch)
      IF varRetryCounter < varMaxNbD2dRetries AND inErrorCode != serrno.ESTKILLED THEN
        -- yes, so let's restart the Disk2DiskCopyJob
        UPDATE Disk2DiskCopyJob
           SET status = dconst.DISK2DISKCOPYJOB_PENDING,
               retryCounter = varRetryCounter + 1
         WHERE transferId = inTransferId;
        -- NOTE(review): facility is 'stagerd' here while the main log line above uses
        -- 'transfermanagerd' - confirm whether this is intended
        logToDLF(NULL, dlf.LVL_SYSTEM, dlf.D2D_D2DDONE_RETRIED, varFileId, varNsHost, 'stagerd', varComment ||
                 ' RetryNb=' || TO_CHAR(varRetryCounter+1) || ' maxNbRetries=' || TO_CHAR(varMaxNbD2dRetries));
      ELSE
        -- no retry, let's delete the disk to disk job copy
        BEGIN
          DELETE FROM Disk2DiskCopyJob WHERE transferId = inTransferId;
          -- and remember the error in case of draining
          IF varDrainingJob IS NOT NULL THEN
            INSERT INTO DrainingErrors (drainingJob, errorMsg, fileId, nsHost)
            VALUES (varDrainingJob, inErrorMessage, varFileId, varNsHost);
          END IF;
        EXCEPTION WHEN NO_DATA_FOUND THEN
          -- the Disk2DiskCopyJob was already dropped (e.g. because of an interrupted draining)
          -- in such a case, forget about the error
          -- NOTE(review): a DELETE matching no row and an INSERT ... VALUES do not raise
          -- NO_DATA_FOUND by themselves; this handler presumably only fires if code run on
          -- their behalf (e.g. a trigger) raises it - to be confirmed
          NULL;
        END;
        logToDLF(NULL, dlf.LVL_NOTICE, dlf.D2D_D2DDONE_NORETRY, varFileId, varNsHost, 'stagerd', varComment ||
                 ' maxNbRetries=' || TO_CHAR(varMaxNbD2dRetries));
        -- Fail waiting subrequests
        UPDATE SubRequest
           SET status = dconst.SUBREQUEST_FAILED,
               lastModificationTime = getTime(),
               errorCode = serrno.SEINTERNAL,
               errorMessage = 'Disk to disk copy failed after ' || TO_CHAR(varMaxNbD2dRetries) ||
                              ' retries. Last error was : ' || inErrorMessage
         WHERE status = dconst.SUBREQUEST_WAITSUBREQ
           AND castorfile = varCfId;
        -- In case of draining, update DrainingJob
        IF varDrainingJob IS NOT NULL THEN
          updateDrainingJobOnD2dEnd(varDrainingJob, varFileSize, True);
        END IF;
      END IF;
    END;
  END IF;
END;
/
/* Recompile all procedures, triggers and functions */
BEGIN
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment