Commit 4b21dfb3 authored by Sebastien Ponce

Merged v2_1_14Version to master

parents 86b25d98 f1a6d684
......@@ -210,21 +210,6 @@
# The value is given in seconds. Default is 5
#TransferManager AdminTimeout 5
# The following option defines how long in seconds a job is allowed to remain in a
# queue waiting for resources before being killed by the transfer manager. For
# convenience, a svcclass name of "all" can be used to define a default value for
# all service classes.
#
# The format of the value is:
# <svcclass1>:<timeout1> [svcclass2:timeout2[...]]
#TransferManager PendingTimeouts all:120 default:120
# Defines how long in seconds a disk2disk copy job can remain in a pending state
# before being terminated. This value is global for all service classes.
#TransferManager DiskCopyPendingTimeout 7200
# The SynchronizationInterval option defines how often the transfer managers check
# that jobs pending for more than 1h in the DB are still handled by the scheduling
# system. This allows cleaning up inconsistencies created by double or severe failures,
......@@ -233,14 +218,12 @@
# It also checks for sources of disk to disk copy that may have been left behind due
# to timeouts in internal messages between the destination and source machines.
# The default value is 300, that is 5 minutes
#TransferManager SynchronizationInterval 300
# Number of requests read from a given connection in one go before looking at other
# connections in a given processing thread of the TransferManager. A big value will
# improve the overall performance by limiting the number of switches (and the inherent
# queue locks) but will reduce the fairness between connections. Default is set to 10
#TransferManager RequestBatchSize 10
# Interval between two updates of the stager database with heartbeats received from
......@@ -248,7 +231,6 @@
# nodes are disabled if their heartbeat is too old (see DiskServer/HeartBeatTimeout
# config parameter in the stager database)
# Default is 1.0
#TransferManager HeartBeatDBUpdateInterval 1.0
# maximum number of slots on this node. Note that these slots are virtual and that
......@@ -270,6 +252,19 @@
#DiskManager recallWeight 1
#DiskManager migrWeight 1
# The following option defines how long in seconds a job is allowed to remain in a
# queue waiting for resources before being killed by the transfer manager. For
# convenience, a svcclass name of "all" can be used to define a default value for
# all service classes.
#
# The format of the value is:
# <svcclass1>:<timeout1> [svcclass2:timeout2[...]]
#DiskManager PendingTimeouts all:120 default:120
# Defines how long in seconds a disk2disk copy job can remain in a pending state
# before being terminated. This value is global for all service classes.
#DiskManager DiskCopyPendingTimeout 7200
# Number of free slots for which the absence of scheduling while something is in the queue
# should be considered abnormal. See ActivityControlChecker thread in the diskmanager
# daemon for more details. This should usually be put to the maximum Weight given in the
......@@ -280,14 +275,12 @@
# of the disk to disk copy when source is not ready
#DiskManager MaxRetryInterval 300
# Interval between two heartbeats sent to the transfer manager, expressed in seconds.
# Default is 1.0
# Interval between two heartbeats sent to the transfer manager, expressed in seconds
#DiskManager HeartbeatInterval 1.0
# Interval between two logs of heartbeat not sent errors, expressed in seconds.
# Others are logged only at debug level in order not to flood the logs when
# the transfermanagers are all down.
# Default is 300.0
#DiskManager HeartbeatNotSentLogInterval 300.0
# Number of user-requested jobs to be scheduled before a backfill job (e.g. internally
......@@ -308,31 +301,26 @@
# The interval between two checks of the GC daemon to see whether there are
# files to be removed from a diskserver. This value is represented in seconds.
#GC Interval 300
# The ChunkInterval is the interval in seconds between synchronization queries to
# the stager catalog and nameserver, i.e. the interval between two bulk checks of
# size ChunkSize. To disable all synchronization checks set this value to 0.
#GC ChunkInterval 1800
# The ChunkSize defines the number of files that the GC daemon should synchronize
# with the stager catalog and nameserver in one go. Note: the largest value is 3000.
#GC ChunkSize 2000
# By default the startup of the GC daemon is deliberately offset by a random interval
# between 1 and 15 minutes. This randomized delay should prevent all GCs in a
# castor2 instance from deleting files at the same time causing an oscillation in
# incoming network traffic due to deletions. By uncommenting this line, you force the
# incoming network traffic due to deletions. By setting this value to yes, you force the
# garbage collection to ignore this starting delay.
#GC ImmediateStart yes
#GC ImmediateStart no
# This option allows disabling the synchronization between the diskservers and the
# stager catalog. The synchronization with the nameserver is not affected.
#GC DisableStagerSync no
......
......@@ -877,7 +877,7 @@ END;
/
ALTER TABLE DiskCopy
ADD CONSTRAINT CK_DiskCopy_Status
CHECK (status IN (0, 4, 5, 6, 7, 9, 10, 11));
CHECK (status IN (0, 4, 5, 6, 7, 9, 11));
ALTER TABLE DiskCopy
ADD CONSTRAINT CK_DiskCopy_GcType
CHECK (gcType IN (0, 1, 2, 3, 4, 5, 6, 7));
......@@ -1174,6 +1174,7 @@ CREATE TABLE DrainingJob
egid INTEGER CONSTRAINT NN_DrainingJob_Egid NOT NULL,
pid INTEGER CONSTRAINT NN_DrainingJob_Pid NOT NULL,
machine VARCHAR2(2048) CONSTRAINT NN_DrainingJob_Machine NOT NULL,
reqId VARCHAR2(2048) CONSTRAINT NN_DrainingJob_ReqId NOT NULL,
creationTime INTEGER CONSTRAINT NN_DrainingJob_CT NOT NULL,
lastModificationTime INTEGER CONSTRAINT NN_DrainingJob_LMT NOT NULL,
status INTEGER CONSTRAINT NN_DrainingJob_Status NOT NULL,
......@@ -1228,16 +1229,25 @@ CREATE TABLE DrainingErrors
(drainingJob INTEGER CONSTRAINT NN_DrainingErrors_DJ NOT NULL,
errorMsg VARCHAR2(2048) CONSTRAINT NN_DrainingErrors_ErrorMsg NOT NULL,
fileId INTEGER CONSTRAINT NN_DrainingErrors_FileId NOT NULL,
nsHost VARCHAR2(2048) CONSTRAINT NN_DrainingErrors_NsHost NOT NULL)
nsHost VARCHAR2(2048) CONSTRAINT NN_DrainingErrors_NsHost NOT NULL,
diskCopy INTEGER,
timeStamp NUMBER CONSTRAINT NN_DrainingErrors_TimeStamp NOT NULL)
ENABLE ROW MOVEMENT;
CREATE INDEX I_DrainingErrors_DJ ON DrainingErrors (drainingJob);
CREATE INDEX I_DrainingErrors_DC ON DrainingErrors (diskCopy);
ALTER TABLE DrainingErrors
ADD CONSTRAINT FK_DrainingErrors_DJ
FOREIGN KEY (drainingJob)
REFERENCES DrainingJob (id);
ALTER TABLE DrainingErrors
ADD CONSTRAINT FK_DrainingErrors_DC
FOREIGN KEY (diskCopy)
REFERENCES DiskCopy (id);
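For reference, and purely as an illustration (not part of the upgrade scripts), a query of the following shape shows how the new diskCopy and timeStamp columns can be used to inspect recent failures of the draining jobs, assuming the DrainingErrors and DrainingJob tables defined above:
-- Illustrative query only: errors recorded during the last hour, per draining job
SELECT DJ.id AS drainingJob, DE.fileId, DE.nsHost, DE.diskCopy, DE.timeStamp, DE.errorMsg
  FROM DrainingErrors DE, DrainingJob DJ
 WHERE DE.drainingJob = DJ.id
   AND DE.timeStamp > getTime() - 3600
 ORDER BY DE.timeStamp DESC;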
/* Definition of the Disk2DiskCopyJob table. Each line is a disk2diskCopy job to process
* id : unique DB identifier for this job
* transferId : unique identifier for the transfer associated to this job
......@@ -1270,6 +1280,7 @@ CREATE TABLE Disk2DiskCopyJob
replicationType INTEGER CONSTRAINT NN_Disk2DiskCopyJob_Type NOT NULL,
replacedDcId INTEGER,
destDcId INTEGER CONSTRAINT NN_Disk2DiskCopyJob_DCId NOT NULL,
srcDcId INTEGER,
drainingJob INTEGER)
INITRANS 50 PCTFREE 50 ENABLE ROW MOVEMENT;
CREATE INDEX I_Disk2DiskCopyJob_Tid ON Disk2DiskCopyJob(transferId);
......
......@@ -391,6 +391,21 @@ EXCEPTION WHEN OTHERS THEN
END;
/
/* A wrapper procedure around DBMS_ALERT.SIGNAL() that skips signalling when another
 * session has already done it, thus avoiding taking a lock. Helps reduce contention on DBMS_ALERT_INFO.
*/
CREATE OR REPLACE PROCEDURE alertSignalNoLock(inName IN VARCHAR2) AS
unused INTEGER;
BEGIN
SELECT 1 INTO unused
FROM SYS.DBMS_ALERT_INFO
WHERE name = upper(inName) AND changed = 'Y'
AND ROWNUM < 2;
EXCEPTION WHEN NO_DATA_FOUND THEN
DBMS_ALERT.SIGNAL(inName, '');
END;
/
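On the consuming side, the service daemons block on these alerts via DBMS_ALERT; a minimal sketch of that pattern (illustrative only, assuming standard DBMS_ALERT semantics and an arbitrary 5 second timeout):
-- Illustrative consumer-side block, not part of this upgrade script
DECLARE
  varMessage VARCHAR2(1800);
  varStatus  INTEGER;
BEGIN
  DBMS_ALERT.REGISTER('wakeUpJobReqSvc');
  -- wait up to 5 seconds; varStatus = 0 means the alert was signalled
  DBMS_ALERT.WAITONE('wakeUpJobReqSvc', varMessage, varStatus, 5);
  IF varStatus = 0 THEN
    NULL;  -- new work is available: the service would process requests here
  END IF;
END;
/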
/* useful procedure to recompile all invalid items in the DB
as many times as needed, until nothing can be improved anymore.
Also reports the list of invalid items if any */
......
......@@ -20,8 +20,6 @@ END;
/* handle the creation of the Disk2DiskCopyJobs for the running drainingJobs */
CREATE OR REPLACE PROCEDURE drainRunner AS
varNbFiles INTEGER;
varNbBytes INTEGER;
varNbRunningJobs INTEGER;
varMaxNbOfSchedD2dPerDrain INTEGER;
varUnused INTEGER;
......@@ -44,8 +42,6 @@ BEGIN
logToDLF(NULL, dlf.LVL_SYSTEM, dlf.DRAINING_REFILL, 0, '', 'stagerd',
'svcClass=' || getSvcClassName(dj.svcClass) || ' DrainReq=' ||
TO_CHAR(dj.id) || ' MaxNewJobsCount=' || TO_CHAR(varMaxNbOfSchedD2dPerDrain-varNbRunningJobs));
varNbFiles := 0;
varNbBytes := 0;
FOR F IN (SELECT * FROM
(SELECT CastorFile.id cfId, Castorfile.nsOpenTime, DiskCopy.id dcId, CastorFile.fileSize
FROM DiskCopy, CastorFile
......@@ -55,19 +51,15 @@ BEGIN
CastorFile.tapeStatus IN (dconst.CASTORFILE_NOTONTAPE, dconst.CASTORFILE_DISKONLY)) OR
(dj.fileMask = dconst.DRAIN_FILEMASK_ALL))
AND DiskCopy.status = dconst.DISKCOPY_VALID
AND NOT EXISTS (SELECT 1 FROM Disk2DiskCopyJob WHERE castorFile=CastorFile.id)
AND NOT EXISTS (SELECT 1 FROM Disk2DiskCopyJob WHERE castorFile = CastorFile.id)
AND NOT EXISTS (SELECT 1 FROM DrainingErrors WHERE diskCopy = DiskCopy.id)
ORDER BY DiskCopy.importance DESC)
WHERE ROWNUM <= varMaxNbOfSchedD2dPerDrain-varNbRunningJobs) LOOP
createDisk2DiskCopyJob(F.cfId, F.nsOpenTime, dj.svcClass, dj.euid, dj.egid,
dconst.REPLICATIONTYPE_DRAINING, F.dcId, dj.id, FALSE);
varNbFiles := varNbFiles + 1;
varNbBytes := varNbBytes + F.fileSize;
END LOOP;
-- commit and update counters
UPDATE DrainingJob
SET totalFiles = totalFiles + varNbFiles,
totalBytes = totalBytes + varNbBytes,
lastModificationTime = getTime()
SET lastModificationTime = getTime()
WHERE id = dj.id;
COMMIT;
EXCEPTION WHEN CONSTRAINT_VIOLATED THEN
......@@ -99,6 +91,7 @@ BEGIN
FOR dj IN (SELECT id, fileSystem FROM DrainingJob WHERE status = dconst.DRAININGJOB_SUBMITTED) LOOP
UPDATE DrainingJob SET status = dconst.DRAININGJOB_STARTING WHERE id = dj.id;
COMMIT;
-- Compute totals now. Jobs will be later added in bunches by drainRunner
SELECT count(*), SUM(diskCopySize) INTO varTFiles, varTBytes
FROM DiskCopy
WHERE fileSystem = dj.fileSystem
......@@ -142,10 +135,13 @@ BEGIN
LOOP
-- Fetch next candidate
FETCH DCcur INTO varDcId, varDcSize, varCfId, varNsOpenTime;
-- no next candidate: this is surprising, but nevertheless we should exit the loop
IF DCcur%NOTFOUND THEN EXIT; END IF;
-- stop if it would be too much
IF varTotalRebalanced + varDcSize > inDataAmount THEN EXIT; END IF;
-- compute new totals
varTotalRebalanced := varTotalRebalanced + varDcSize;
varNbFilesRebalanced := varNbFilesRebalanced + 1;
-- stop if it would be too much
IF varTotalRebalanced > inDataAmount THEN EXIT; END IF;
-- create disk2DiskCopyJob for this diskCopy
createDisk2DiskCopyJob(varCfId, varNsOpenTime, inDestSvcClassId,
0, 0, dconst.REPLICATIONTYPE_REBALANCE,
......@@ -242,7 +238,7 @@ BEGIN
END LOOP;
-- Create the drain manager job to be executed every minute. This one starts and clean up draining jobs
-- Create the drain manager job to be executed every minute. This one starts and cleans up draining jobs
DBMS_SCHEDULER.CREATE_JOB(
JOB_NAME => 'drainManagerJob',
JOB_TYPE => 'PLSQL_BLOCK',
......
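The remaining parameters of this CREATE_JOB call are elided in the diff; purely for orientation (placeholder names and body, not the actual drainManagerJob definition), a one-minute PL/SQL-block job using the standard DBMS_SCHEDULER interface looks like:
-- Generic illustration of a per-minute scheduler job; all values are placeholders
BEGIN
  DBMS_SCHEDULER.CREATE_JOB(
    JOB_NAME        => 'someMaintenanceJob',
    JOB_TYPE        => 'PLSQL_BLOCK',
    JOB_ACTION      => 'BEGIN NULL; END;',
    START_DATE      => SYSTIMESTAMP,
    REPEAT_INTERVAL => 'FREQ=MINUTELY; INTERVAL=1',
    ENABLED         => TRUE,
    COMMENTS        => 'placeholder job running every minute');
END;
/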
......@@ -378,7 +378,7 @@ BEGIN
Castorfile.fileid, Castorfile.nshost,
DiskCopy.lastAccessTime, DiskCopy.nbCopyAccesses, DiskCopy.gcWeight,
getObjStatusName('DiskCopy', 'gcType', DiskCopy.gcType),
getSvcClassList(FileSystem.id)
getSvcClassList(FileSystem.id)
FROM CastorFile, DiskCopy, FileSystem, DiskServer
WHERE decode(DiskCopy.status, 9, DiskCopy.status, NULL) = 9 -- BEINGDELETED
AND DiskCopy.castorfile = CastorFile.id
......@@ -480,9 +480,9 @@ BEGIN
-- file from the nameserver. For safety, we thus keep it
NULL;
WHEN CONSTRAINT_VIOLATED THEN
IF sqlerrm LIKE '%constraint (CASTOR_STAGER.FK_DISK2DISKCOPYJOB_CASTORFILE) violated%' THEN
-- Ignore the deletion, probably some draining/rebalancing activity created a Disk2DiskCopyJob entity
-- while we were attempting to drop the CastorFile
IF sqlerrm LIKE '%constraint (CASTOR_STAGER.FK_%_CASTORFILE) violated%' THEN
-- Ignore the deletion, probably some draining/rebalancing/recall activity created
-- a new Disk2DiskCopyJob/RecallJob entity while we were attempting to drop the CastorFile
NULL;
ELSE
-- Any other constraint violation is an error
......
......@@ -160,7 +160,7 @@ BEGIN
SET status = dconst.SUBREQUEST_RESTART, lastModificationTime = getTime()
WHERE status = dconst.SUBREQUEST_WAITSUBREQ
AND castorFile = varCfId;
DBMS_ALERT.SIGNAL('wakeUpJobReqSvc', '');
alertSignalNoLock('wakeUpJobReqSvc');
-- link DiskCopy and FileSystem and update DiskCopyStatus
UPDATE DiskCopy
SET status = 6, -- DISKCOPY_STAGEOUT
......@@ -336,7 +336,6 @@ BEGIN
-- update DrainingJob
UPDATE DrainingJob
SET status = varStatus,
totalFiles = varTotalFiles,
nbFailedBytes = varNbFailedBytes,
nbSuccessBytes = varNbSuccessBytes,
nbFailedFiles = varNbFailedFiles,
......@@ -357,6 +356,7 @@ CREATE OR REPLACE PROCEDURE disk2DiskCopyEnded
varUid INTEGER := -1;
varGid INTEGER := -1;
varDestDcId INTEGER;
varSrcDcId INTEGER;
varDestSvcClass INTEGER;
varRepType INTEGER;
varReplacedDcId INTEGER;
......@@ -373,30 +373,48 @@ CREATE OR REPLACE PROCEDURE disk2DiskCopyEnded
varComment VARCHAR2(2048);
varDrainingJob VARCHAR2(2048);
BEGIN
varLogMsg := CASE WHEN inErrorMessage IS NULL THEN dlf.D2D_D2DDONE_OK ELSE dlf.D2D_D2DFAILED END;
BEGIN
-- Get data from the disk2DiskCopy Job
SELECT castorFile, ouid, ogid, destDcId, destSvcClass, replicationType,
IF inDestPath != '' THEN
-- Parse destination path
parsePath(inDestDsName ||':'|| inDestPath, varDestFsId, varDestPath, varDestDcId, varFileId, varNsHost);
-- ELSE we are called because of an error at start: try to gather information
-- from the Disk2DiskCopyJob entry and fail accordingly.
END IF;
-- Get data from the Disk2DiskCopyJob
SELECT castorFile, ouid, ogid, destDcId, srcDcId, destSvcClass, replicationType,
replacedDcId, retryCounter, drainingJob
INTO varCfId, varUid, varGid, varDestDcId, varDestSvcClass, varRepType,
INTO varCfId, varUid, varGid, varDestDcId, varSrcDcId, varDestSvcClass, varRepType,
varReplacedDcId, varRetryCounter, varDrainingJob
FROM Disk2DiskCopyJob
WHERE transferId = inTransferId;
-- lock the castor file (and get logging info)
SELECT fileid, nsHost, fileSize INTO varFileId, varNsHost, varFileSize
FROM CastorFile
WHERE id = varCfId
FOR UPDATE;
EXCEPTION WHEN NO_DATA_FOUND THEN
-- two possibilities here :
-- - disk2diskCopyJob not found. It was probably canceled.
-- - the castorFile has disappeared before we locked it, and the
-- disk2diskCopyJob too as we have a foreign key constraint.
-- So our brand new copy has to be created as invalid to trigger GC.
-- The job was probably canceled: so our brand new copy
-- has to be created as invalid to trigger GC, and linked
-- to the (hopefully existing) correct CastorFile.
varNewDcStatus := dconst.DISKCOPY_INVALID;
varLogMsg := dlf.D2D_D2DDONE_CANCEL;
varDestDcId := ids_seq.nextval;
BEGIN
SELECT id INTO varCfId
FROM CastorFile
WHERE fileId = varFileId;
EXCEPTION WHEN NO_DATA_FOUND THEN
-- Here we also lost the CastorFile: this could happen
-- if the GC ran meanwhile. Fail and leave dark data behind,
-- the GC will eventually catch up. A full solution would be
-- to gather here all missing information to correctly
-- recreate the CastorFile entry, but this is too complex
-- for what we would gain.
logToDLF(NULL, dlf.LVL_NOTICE, dlf.D2D_D2DDONE_CANCEL, varFileId, varNsHost, 'transfermanagerd',
'transferId=' || inTransferId || ' errorMessage="CastorFile disappeared, giving up"');
RETURN;
END;
END;
varLogMsg := CASE WHEN inErrorMessage IS NULL THEN dlf.D2D_D2DDONE_OK ELSE dlf.D2D_D2DFAILED END;
-- lock the castor file (and get logging info)
SELECT fileid, nsHost, fileSize INTO varFileId, varNsHost, varFileSize
FROM CastorFile
WHERE id = varCfId
FOR UPDATE;
-- check the filesize
IF inReplicaFileSize != varFileSize THEN
-- replication went wrong !
......@@ -406,10 +424,10 @@ BEGIN
END IF;
END IF;
-- Log success or failure of the replication
varComment := 'transferId=' || inTransferId ||
' destSvcClass=' || getSvcClassName(varDestSvcClass) ||
' dstDcId=' || TO_CHAR(varDestDcId) || ' destPath=' || inDestDsName || ':' || inDestPath ||
' euid=' || TO_CHAR(varUid) || ' egid=' || TO_CHAR(varGid) ||
varComment := 'transferId="' || inTransferId ||
'" destSvcClass=' || getSvcClassName(varDestSvcClass) ||
' dstDcId=' || TO_CHAR(varDestDcId) || ' destPath="' || inDestPath ||
'" euid=' || TO_CHAR(varUid) || ' egid=' || TO_CHAR(varGid) ||
' fileSize=' || TO_CHAR(varFileSize);
IF inErrorMessage IS NOT NULL THEN
varComment := varComment || ' replicaFileSize=' || TO_CHAR(inReplicaFileSize) ||
......@@ -418,13 +436,6 @@ BEGIN
logToDLF(NULL, dlf.LVL_SYSTEM, varLogMsg, varFileId, varNsHost, 'transfermanagerd', varComment);
-- if success, create new DiskCopy, restart waiting requests, cleanup and handle replicate on close
IF inErrorMessage IS NULL THEN
-- get filesystem of the diskcopy and parse diskcopy path
SELECT FileSystem.id, SUBSTR(inDestPath, LENGTH(FileSystem.mountPoint)+1)
INTO varDestFsId, varDestPath
FROM DiskServer, FileSystem
WHERE DiskServer.name = inDestDsName
AND FileSystem.diskServer = DiskServer.id
AND INSTR(inDestPath, FileSystem.mountPoint) = 1;
-- compute GcWeight and importance of the new copy
IF varNewDcStatus = dconst.DISKCOPY_VALID THEN
DECLARE
......@@ -452,7 +463,7 @@ BEGIN
lastModificationTime = getTime()
WHERE status = dconst.SUBREQUEST_WAITSUBREQ
AND castorfile = varCfId;
DBMS_ALERT.SIGNAL('wakeUpJobReqSvc', '');
alertSignalNoLock('wakeUpJobReqSvc');
-- delete the disk2diskCopyJob
DELETE FROM Disk2DiskCopyjob WHERE transferId = inTransferId;
-- In case of valid new copy
......@@ -471,6 +482,7 @@ BEGIN
updateDrainingJobOnD2dEnd(varDrainingJob, varFileSize, False);
END IF;
ELSE
-- failure
DECLARE
varMaxNbD2dRetries INTEGER := TO_NUMBER(getConfigOption('D2dCopy', 'MaxNbRetries', 2));
BEGIN
......@@ -483,7 +495,7 @@ BEGIN
SET status = dconst.DISK2DISKCOPYJOB_PENDING,
retryCounter = varRetryCounter + 1
WHERE transferId = inTransferId;
logToDLF(NULL, dlf.LVL_SYSTEM, dlf.D2D_D2DDONE_RETRIED, varFileId, varNsHost, 'stagerd', varComment ||
logToDLF(NULL, dlf.LVL_SYSTEM, dlf.D2D_D2DDONE_RETRIED, varFileId, varNsHost, 'transfermanagerd', varComment ||
' RetryNb=' || TO_CHAR(varRetryCounter+1) || ' maxNbRetries=' || TO_CHAR(varMaxNbD2dRetries));
ELSE
-- no retry, let's delete the disk to disk job copy
......@@ -491,15 +503,15 @@ BEGIN
DELETE FROM Disk2DiskCopyjob WHERE transferId = inTransferId;
-- and remember the error in case of draining
IF varDrainingJob IS NOT NULL THEN
INSERT INTO DrainingErrors (drainingJob, errorMsg, fileId, nsHost)
VALUES (varDrainingJob, inErrorMessage, varFileId, varNsHost);
INSERT INTO DrainingErrors (drainingJob, errorMsg, fileId, nsHost, diskCopy, timeStamp)
VALUES (varDrainingJob, inErrorMessage, varFileId, varNsHost, varSrcDcId, getTime());
END IF;
EXCEPTION WHEN NO_DATA_FOUND THEN
-- the Disk2DiskCopyjob was already dropped (e.g. because of an interrupted draining)
-- in such a case, forget about the error
NULL;
END;
logToDLF(NULL, dlf.LVL_NOTICE, dlf.D2D_D2DDONE_NORETRY, varFileId, varNsHost, 'stagerd', varComment ||
logToDLF(NULL, dlf.LVL_NOTICE, dlf.D2D_D2DDONE_NORETRY, varFileId, varNsHost, 'transfermanagerd', varComment ||
' maxNbRetries=' || TO_CHAR(varMaxNbD2dRetries));
-- Fail waiting subrequests
UPDATE SubRequest
......@@ -546,10 +558,11 @@ BEGIN
SELECT castorFile, destDcId INTO varCfId, varDestDcId
FROM Disk2DiskCopyJob
WHERE transferId = inTransferId
AND status = dconst.DISK2DISKCOPYJOB_SCHEDULED;
AND status = dconst.DISK2DISKCOPYJOB_SCHEDULED
FOR UPDATE;
EXCEPTION WHEN NO_DATA_FOUND THEN
-- log "disk2DiskCopyStart : Replication request canceled while queuing in scheduler or transfer already started"
logToDLF(NULL, dlf.LVL_USER_ERROR, dlf.D2D_CANCELED_AT_START, inFileId, inNsHost, 'stagerd',
logToDLF(NULL, dlf.LVL_USER_ERROR, dlf.D2D_CANCELED_AT_START, inFileId, inNsHost, 'transfermanagerd',
'TransferId=' || TO_CHAR(inTransferId) || ' destDiskServer=' || inDestDiskServerName ||
' destMountPoint=' || inDestMountPoint || ' srcDiskServer=' || inSrcDiskServerName ||
' srcMountPoint=' || inSrcMountPoint);
......@@ -571,7 +584,7 @@ BEGIN
AND DiskCopy.castorFile = varCfId;
EXCEPTION WHEN NO_DATA_FOUND THEN
-- log "disk2DiskCopyStart : Source has disappeared while queuing in scheduler, retrying"
logToDLF(NULL, dlf.LVL_SYSTEM, dlf.D2D_SOURCE_GONE, inFileId, inNsHost, 'stagerd',
logToDLF(NULL, dlf.LVL_SYSTEM, dlf.D2D_SOURCE_GONE, inFileId, inNsHost, 'transfermanagerd',
'TransferId=' || TO_CHAR(inTransferId) || ' destDiskServer=' || inDestDiskServerName ||
' destMountPoint=' || inDestMountPoint || ' srcDiskServer=' || inSrcDiskServerName ||
' srcMountPoint=' || inSrcMountPoint);
......@@ -581,10 +594,18 @@ BEGIN
-- raise exception for the scheduling part
raise_application_error(-20110, dlf.D2D_SOURCE_GONE);
END;
-- at this point we can update the Disk2DiskCopyJob with the source. This may be used
-- by disk2DiskCopyEnded to track the failed sources.
UPDATE Disk2DiskCopyJob
SET status = dconst.DISK2DISKCOPYJOB_RUNNING,
srcDcId = varSrcDcId
WHERE transferId = inTransferId;
IF (varSrcDsStatus = dconst.DISKSERVER_DISABLED OR varSrcFsStatus = dconst.FILESYSTEM_DISABLED
OR varSrcHwOnline = 0) THEN
-- log "disk2DiskCopyStart : Source diskserver/filesystem was DISABLED meanwhile"
logToDLF(NULL, dlf.LVL_WARNING, dlf.D2D_SRC_DISABLED, inFileId, inNsHost, 'stagerd',
logToDLF(NULL, dlf.LVL_WARNING, dlf.D2D_SRC_DISABLED, inFileId, inNsHost, 'transfermanagerd',
'TransferId=' || TO_CHAR(inTransferId) || ' diskServer=' || inSrcDiskServerName ||
' fileSystem=' || inSrcMountPoint);
-- fail d2d transfer
......@@ -604,7 +625,7 @@ BEGIN
IF (varDestDsStatus != dconst.DISKSERVER_PRODUCTION OR varDestFsStatus != dconst.FILESYSTEM_PRODUCTION
OR varDestHwOnline = 0) THEN
-- log "disk2DiskCopyStart : Destination diskserver/filesystem not in PRODUCTION any longer"
logToDLF(NULL, dlf.LVL_WARNING, dlf.D2D_DEST_NOT_PRODUCTION, inFileId, inNsHost, 'stagerd',
logToDLF(NULL, dlf.LVL_WARNING, dlf.D2D_DEST_NOT_PRODUCTION, inFileId, inNsHost, 'transfermanagerd',
'TransferId=' || TO_CHAR(inTransferId) || ' diskServer=' || inDestDiskServerName);
-- fail d2d transfer
disk2DiskCopyEnded(inTransferId, '', '', 0, 0, 'Destination not in production');
......@@ -622,7 +643,7 @@ BEGIN
AND DiskCopy.status = dconst.DISKCOPY_VALID;
IF varNbCopies > 0 THEN
-- log "disk2DiskCopyStart : Multiple copies of this file already found on this diskserver"
logToDLF(NULL, dlf.LVL_ERROR, dlf.D2D_MULTIPLE_COPIES_ON_DS, inFileId, inNsHost, 'stagerd',
logToDLF(NULL, dlf.LVL_ERROR, dlf.D2D_MULTIPLE_COPIES_ON_DS, inFileId, inNsHost, 'transfermanagerd',
'TransferId=' || TO_CHAR(inTransferId) || ' diskServer=' || inDestDiskServerName);
-- fail d2d transfer
disk2DiskCopyEnded(inTransferId, '', '', 0, 0, 'Copy found on diskserver');
......@@ -631,11 +652,6 @@ BEGIN
raise_application_error(-20110, dlf.D2D_MULTIPLE_COPIES_ON_DS);
END IF;
-- update the Disk2DiskCopyJob status and filesystem
UPDATE Disk2DiskCopyJob
SET status = dconst.DISK2DISKCOPYJOB_RUNNING
WHERE transferId = inTransferId;
-- build full path of destination copy
buildPathFromFileId(inFileId, inNsHost, varDestDcId, outDestDcPath);
outDestDcPath := inDestMountPoint || outDestDcPath;
......@@ -645,7 +661,7 @@ BEGIN
outSrcDcPath := inSrcDiskServerName || ':' || inSrcMountPoint || outSrcDcPath;
-- log "disk2DiskCopyStart called and returned successfully"
logToDLF(NULL, dlf.LVL_SYSTEM, dlf.D2D_START_OK, inFileId, inNsHost, 'stagerd',
logToDLF(NULL, dlf.LVL_SYSTEM, dlf.D2D_START_OK, inFileId, inNsHost, 'transfermanagerd',
'TransferId=' || TO_CHAR(inTransferId) || ' srcPath=' || outSrcDcPath ||
' destPath=' || outDestDcPath);
END;
......@@ -738,7 +754,7 @@ BEGIN
AND castorFile = cfId
AND status = dconst.SUBREQUEST_WAITSUBREQ;
-- and wake up the stager for processing it
DBMS_ALERT.SIGNAL('wakeUpStageReqSvc', '');
alertSignalNoLock('wakeUpStageReqSvc');
END IF;
-- Archive Subrequest
archiveSubReq(srId, 8); -- FINISHED
......@@ -818,7 +834,7 @@ BEGIN
WHERE castorFile = cfId
AND reqType = 39 -- PutDone
AND SubRequest.status = dconst.SUBREQUEST_WAITSUBREQ;
DBMS_ALERT.SIGNAL('wakeUpStageReqSvc', '');
alertSignalNoLock('wakeUpStageReqSvc');
EXCEPTION WHEN NO_DATA_FOUND THEN
-- This means we are a standalone put
-- thus cleanup DiskCopy and maybe the CastorFile
......@@ -950,7 +966,7 @@ BEGIN
-- try disk2diskCopyJob
SELECT id into srId FROM Disk2diskCopyJob WHERE transferId = subReqIds(i);
EXCEPTION WHEN NO_DATA_FOUND THEN
CONTINUE; -- The SubRequest/disk2DiskCopyJob may have be removed, nothing to be done.
CONTINUE; -- The SubRequest/disk2DiskCopyJob may have been removed, nothing to be done.
END;
disk2DiskCopyEnded(subReqIds(i), '', '', 0, errnos(i), errmsgs(i));
END;
......@@ -1004,14 +1020,14 @@ END;
CREATE OR REPLACE TRIGGER tr_SubRequest_informSchedReady AFTER UPDATE OF status ON SubRequest
FOR EACH ROW WHEN (new.status = 13) -- SUBREQUEST_READYFORSCHED
BEGIN
DBMS_ALERT.SIGNAL('transferReadyToSchedule', '');
alertSignalNoLock('transferReadyToSchedule');
END;
/
CREATE OR REPLACE TRIGGER tr_SubRequest_informError AFTER UPDATE OF status ON SubRequest
FOR EACH ROW WHEN (new.status = 7) -- SUBREQUEST_FAILED
BEGIN
DBMS_ALERT.SIGNAL('wakeUpErrorSvc', '');
alertSignalNoLock('wakeUpErrorSvc');
END;
/
......
......@@ -93,7 +93,7 @@ BEGIN
-- insert a row into newRequests table to trigger the processing of the request
INSERT INTO newRequests (id, type, creation) VALUES (reqId, reqType, to_date('01011970','ddmmyyyy') + 1/24/60/60 * creationTime);
-- send an alert to accelerate the processing of the request
DBMS_ALERT.SIGNAL('wakeUpJobSvc', '');
alertSignalNoLock('wakeUpJobSvc');
END;
/
......@@ -188,15 +188,15 @@ BEGIN
WHEN inReqType = 35 OR -- StageGetRequest
inReqType = 40 OR -- StagePutRequest
inReqType = 44 THEN -- StageUpdateRequest
DBMS_ALERT.SIGNAL('wakeUpJobReqSvc', '');
alertSignalNoLock('wakeUpJobReqSvc');
WHEN inReqType = 36 OR -- StagePrepareToGetRequest
inReqType = 37 OR -- StagePrepareToPutRequest
inReqType = 38 THEN -- StagePrepareToUpdateRequest
DBMS_ALERT.SIGNAL('wakeUpPrepReqSvc', '');
alertSignalNoLock('wakeUpPrepReqSvc');
WHEN inReqType = 42 OR -- StageRmRequest
inReqType = 39 OR -- StagePutDoneRequest
inReqType = 95 THEN -- SetFileGCWeight
DBMS_ALERT.SIGNAL('wakeUpStageReqSvc', '');
alertSignalNoLock('wakeUpStageReqSvc');
END CASE;
END;
/
......@@ -251,7 +251,7 @@ BEGIN
-- insert a row into newRequests table to trigger the processing of the request
INSERT INTO newRequests (id, type, creation) VALUES (reqId, reqType, to_date('01011970','ddmmyyyy') + 1/24/60/60 * creationTime);
-- send an alert to accelerate the processing of the request
DBMS_ALERT.SIGNAL('wakeUpJobSvc', '');
alertSignalNoLock('wakeUpJobSvc');
END;
/
......@@ -295,7 +295,7 @@ BEGIN
-- insert a row into newRequests table to trigger the processing of the request
INSERT INTO newRequests (id, type, creation) VALUES (reqId, reqType, to_date('01011970','ddmmyyyy') + 1/24/60/60 * creationTime);
-- send an alert to accelerate the processing of the request
DBMS_ALERT.SIGNAL('wakeUpQueryReqSvc', '');
alertSignalNoLock('wakeUpQueryReqSvc');
END;
/
......@@ -350,7 +350,7 @@ BEGIN
-- insert a row into newRequests table to trigger the processing of the request
INSERT INTO newRequests (id, type, creation) VALUES (reqId, reqType, to_date('01011970','ddmmyyyy') + 1/24/60/60 * creationTime);
-- send an alert to accelerate the processing of the request
DBMS_ALERT.SIGNAL('wakeUpQueryReqSvc', '');
alertSignalNoLock('wakeUpQueryReqSvc');
END;
/