Commit da1fbc3f authored by Eric Cano

Catching up with master branch.

parents af62743d 88477545
......@@ -8,17 +8,128 @@
This is a minor release bringing a number of bug fixes on top of the previous
one.
CASTOR Stager
-------------
[Bug]
#93295 Binding error in call to 'INSERTSTAGEFILEQUERYREQUEST' (ORA-06550, PLS-00306)
#103117 diskserver with no filesystem makes strange output in printdiskserver
#103118 fail immediately on WARNINGS in draindiskserver
#103119 in transfermanager synchronization with the DB, do not cancel too much in one go
#103144 Bad constraint for status of RecallJobs
#103158 Bad DiskCopy status in createEmptyFile
#103162 Race condition between drainRunner and deleteDrainingJob
#103165 "deadlock detected" around CASTOR_STAGER.DROPREUSEDLASTKNOWNFILENAME
#103184 Missing lock on DrainingJob in drainManager leads to deadlock
#103188 catch NO_DATA_FOUND in getFile/SvcClassName
#103189 Transfermanager synchronizer thread is not thread-safe
#103190 Constraint violation errors thrown by the GC
#103235 Stuck files after synchronization removes a file during recall
#103242 Incorrect evaluation of lastModificationTime leads to double recalls + dark data
[Features]
#103106 RFE: mark rebalancing transfers as such in listtransfer
#103196 RFE: improve usage of DBMS_ALERT in the database
#103243 RFE: drop GC/syncInterval option as it is redundant
CASTOR NS
---------
[Bug]
#103170 fix log levels in nameserver logs
[Features]
#103168 RFE: Improve logging of failures in setOrReplaceSegmentsForFiles
CASTOR Protocols
----------------
[Bug]
#92559 rfiod does not close
#103021 Xrootd castor plugin around xcastor::XrdxCastorClient::SendAsyncRequest
#103001 XRootD plugin authorization deadlock
CASTOR Tape
-----------
[Bug]
#103026 "showqueues" segfaults unless VDQM_HOST is set
#103170 fix log levels in nameserver logs
#103175 Concurrent recalls that fail concurrently may create a deadlock if they share files
[Features]
#103082 RFE: Remove tape gateway protocol header files from castor-build-headers.rpm
CASTOR Monitoring
-----------------
[Features]
#103176 RFE: properly package the simple log processor plugin for CASTOR
Package Changes
---------------
- castor-slp-plugin is a new package containing the CASTOR plugins for the simple log processor
daemon that gathers the logs in the new monitoring infrastructure.
It replaces the hand-made slp-castor-plugin package that was distributed outside of CASTOR.
Upgrade Instructions from 2.1.14-3
----------------------------------
Stager
------
The upgrade of the stager database to 2.1.14-4 can be performed online while the system is running.
The RPM upgrade can also be performed on a running system, with 2 restrictions:
- the restart of the RH servers may be noticed by a few clients (< 1s downtime);
- the restart of the transfermanagers and diskmanagers should not be done concurrently.
Instructions
------------
1. Upgrade the STAGER database using the stager_2.1.14-3_to_2.1.14-4.sql
upgrade script (see the sketch after this list) available from:
- http://cern.ch/castor/DIST/CERN/savannah/CASTOR.pkg/2.1.14-*/2.1.14-4/dbupgrades
2. Upgrade the software on the headnodes.
Note: All daemons involved in the upgrade will be restarted automatically.
3. Upgrade the software on the diskservers.
Note: All daemons involved in the upgrade will be restarted automatically.
4. Test the instance by running the test suite available from:
- http://cern.ch/castor/DIST/CERN/savannah/CASTOR.pkg/2.1.14-*/2.1.14-4/testsuite
5. Congratulations, you have successfully upgraded to the 2.1.14-4 release
of CASTOR.
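A minimal sketch of step 1, assuming a SQL*Plus session connected as the stager
database account and the upgrade script already downloaded from the URL above
(the account and script location are deployment specific):
  SQL> @stager_2.1.14-3_to_2.1.14-4.sql
Review the script output for errors before moving on to the RPM upgrades of
steps 2 and 3.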
Central services (CUPV, VMGR, VDQM, Nameserver)
-----------------------------------------------
The upgrade of the central databases to 2.1.14-4 can be performed online while
the system is running.
Instructions
------------
1. Apply the appropriate database upgrade script from:
- http://cern.ch/castor/DIST/CERN/savannah/CASTOR.pkg/2.1.14-*/2.1.14-4/dbupgrades
2. Update the software to use the 2.1.14-4 RPMs. Note: All
daemons involved in the upgrade will be restarted automatically.
3. Upgrade complete.
------------
- 2.1.14-3 -
......@@ -66,6 +177,7 @@
#102830 RFE: Remove unused accounting from the tape server software
#102831 RFE: Remove the reason argument from tpconfig
#102939 RFE: Remove the castor-sacct package
#103138 RFE: rtcpd should not space the tape backwards and forwards when recalling
[Code Maintenance]
......
......@@ -13,17 +13,26 @@
%define compile_server @COMPILE_SERVER@
%endif
# Neutral packaging (for srpm)
#-----------------------------
%if 0%{?neutralpackage:1} > 0
%define mydist %{nil}
%else
%define mydist %{?dist}
%endif
# General settings
#-----------------
Summary: Cern Advanced mass STORage
Name: %{name}
Version: %{castorVersion}
Release: %{castorRelease}%{?dist}
Source: %{name}-%{version}-%{castorRelease}%{?dist}.tar.gz
Release: %{castorRelease}%{mydist}
Source: %{name}-%{castorVersion}-%{castorRelease}.tar.gz
URL: http://cern.ch/castor
License: http://cern.ch/castor/DIST/CONDITIONS
Group: Application/Castor
BuildRoot: %{_builddir}/%{name}-%{version}-root
BuildRequires: cmake >= 2.6
# only build debug info if you're building the whole code
%if %compile_server
......@@ -38,7 +47,7 @@ BuildRequires: oracle-instantclient-devel
The CASTOR Project stands for CERN Advanced STORage Manager, and its goal is to handle LHC data in a fully distributed environment.
%prep
%setup -q -n %{name}-%{version}-%{release}
%setup -q -n %{name}-%{castorVersion}-%{castorRelease}
%build
......
......@@ -138,6 +138,11 @@ set (CLIENT_LIB_SRC_FILES
io/StreamBWUserCnv.cpp
io/StreamRequestTypeCnv.cpp
io/StreamNsFileIdCnv.cpp
log/IPAddress.cpp
log/Log.cpp
log/Message.cpp
log/Param.cpp
log/TimeStamp.cpp
replier/ClientConnection.cpp
replier/RequestReplier.cpp
rh/Client.cpp
......@@ -312,6 +317,7 @@ if (${COMPILE_SERVER} STREQUAL "1")
exception/Communication.hpp
exception/Exception.hpp
exception/Internal.hpp
exception/InvalidNbArguments.hpp
DESTINATION ${CASTOR_DEST_CPP_HEADERS_DIR}/exception
PERMISSIONS ${CASTOR_HEADER_PERMS})
......
......@@ -125,6 +125,11 @@ STGLIB_SRCS = BaseAddress.cpp \
io/StreamBWUserCnv.cpp \
io/StreamRequestTypeCnv.cpp \
io/StreamNsFileIdCnv.cpp \
log/IPAddress.cpp \
log/Log.cpp \
log/Message.cpp \
log/Param.cpp \
log/TimeStamp.cpp \
replier/ClientConnection.cpp \
replier/RequestReplier.cpp \
rh/Client.cpp \
......@@ -350,6 +355,7 @@ InstallNonExecFile(db/ora/OraCnvSvc.hpp,$(BUILDHEADERSDIRHPP)/db/ora,644)
InstallNonExecFile(db/ora/OraCommonSvc.hpp,$(BUILDHEADERSDIRHPP)/db/ora,644)
InstallNonExecFile(exception/InvalidArgument.hpp,$(BUILDHEADERSDIRHPP)/exception,644)
InstallNonExecFile(exception/InvalidNbArguments.hpp,$(BUILDHEADERSDIRHPP)/exception,644)
InstallNonExecFile(exception/NoEntry.hpp,$(BUILDHEADERSDIRHPP)/exception,644)
InstallNonExecFile(exception/OutOfMemory.hpp,$(BUILDHEADERSDIRHPP)/exception,644)
InstallNonExecFile(exception/SQLError.hpp,$(BUILDHEADERSDIRHPP)/exception,644)
......
......@@ -306,30 +306,22 @@
#DiskManager FSMaxFreeSpace .05
#DiskManager FSMinAllowedFreeSpace .02
# The frequency at which the GC daemon checks the stager to see whether there are
# files to be removed on the diskserver. This value is represented in seconds.
# The interval between two checks of the GC daemon to see whether there are
# files to be removed from a diskserver. This value is represented in seconds.
#GC Interval 300
# The SyncInterval is the frequency in seconds between full rechecks of the
# diskservers contents with the stager catalog and nameserver. I.e once the
# synchronization check is complete, the GC daemon will wait SyncInterval seconds
# before starting the next round of checks. To disable all synchronization checks
# set this value to 0
# The ChunkInterval is the interval in seconds between synchronization queries to
# the stager catalog and nameserver, i.e. the interval between two bulk checks of
# size ChunkSize. To disable all synchronization checks set this value to 0.
#GC SyncInterval 3600
#GC ChunkInterval 1800
# The ChunkSize defines the number of files that the GC daemon should synchronize
# with the stager catalog and nameserver in one go. Note: the largest value is 3000
# with the stager catalog and nameserver in one go. Note: the largest value is 3000.
#GC ChunkSize 2000
# The ChunkInterval is the frequency in seconds between synchronization queries to
# the stager catalog and nameserver. I.e the interval between two bulk checks of
# size ChunkSize
#GC ChunkInterval 1800
# By default the startup of the GC daemon is deliberately offset by a random interval
# between 1 and 15 minutes. This randomized delay should prevent all GCs in a
# castor2 instance from deleting files at the same time causing an oscillation in
......
......@@ -581,7 +581,7 @@ END;
/
ALTER TABLE RecallJob
ADD CONSTRAINT CK_RecallJob_Status
CHECK (status IN (0, 1, 2));
CHECK (status IN (1, 2, 3));
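-- Illustrative sketch, not part of the creation script: before tightening the
-- status constraint on an existing schema, one could check for RecallJob rows
-- falling outside the new range (the column and values are those shown above).
SELECT status, COUNT(*)
  FROM RecallJob
 WHERE status NOT IN (1, 2, 3)
 GROUP BY status;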
/* Definition of the TapePool table
* name : the name of the TapePool
......@@ -1250,7 +1250,7 @@ ALTER TABLE DrainingErrors
* nsOpenTime : the nsOpenTime of the castorFile when this job was created
* Allows to detect if the file has been overwritten during replication
* destSvcClass : the destination service class
* replicationType : the type of replication involved (user, internal or draining)
* replicationType : the type of replication involved (user, internal, draining or rebalancing)
* replacedDcId : in case of draining, the replaced diskCopy to be dropped
* destDcId : the destination diskCopy
* drainingJob : the draining job behind this d2dJob. Not NULL only if replicationType is DRAINING
......
......@@ -73,6 +73,8 @@ CREATE OR REPLACE FUNCTION getFileClassName(fileClassId NUMBER) RETURN VARCHAR2
BEGIN
SELECT name INTO varFileClassName FROM FileClass WHERE id = fileClassId;
RETURN varFileClassName;
EXCEPTION WHEN NO_DATA_FOUND THEN
RETURN 'Unknown(' || fileClassId || ')';
END;
/
......@@ -82,6 +84,8 @@ CREATE OR REPLACE FUNCTION getSvcClassName(svcClassId NUMBER) RETURN VARCHAR2 IS
BEGIN
SELECT name INTO varSvcClassName FROM SvcClass WHERE id = svcClassId;
RETURN varSvcClassName;
EXCEPTION WHEN NO_DATA_FOUND THEN
RETURN 'Unknown(' || svcClassId || ')';
END;
/
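-- Illustrative sketch, not part of the script: with the NO_DATA_FOUND handlers
-- above, getFileClassName and getSvcClassName return a placeholder instead of
-- raising when no row matches; 999999 is a hypothetical, unused id.
SELECT getSvcClassName(999999) FROM DUAL;
-- expected result: Unknown(999999)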
......@@ -171,6 +175,47 @@ BEGIN
END;
/
/* parse a path to give back the FileSystem and path */
CREATE OR REPLACE PROCEDURE parsePath(inFullPath IN VARCHAR2,
outFileSystem OUT INTEGER,
outPath OUT VARCHAR2,
outDcId OUT INTEGER,
outFileId OUT INTEGER,
outNsHost OUT VARCHAR2) AS
varPathPos INTEGER;
varLastDotPos INTEGER;
varLastSlashPos INTEGER;
varAtPos INTEGER;
varColonPos INTEGER;
varDiskServerName VARCHAR2(2048);
varMountPoint VARCHAR2(2048);
BEGIN
-- path starts after the second '/' from the end
varPathPos := INSTR(inFullPath, '/', -1, 2);
outPath := SUBSTR(inFullPath, varPathPos+1);
-- DcId is the part after the last '.'
varLastDotPos := INSTR(inFullPath, '.', -1, 1);
outDcId := TO_NUMBER(SUBSTR(inFullPath, varLastDotPos+1));
-- the mountPoint is between the ':' and the start of the path
varColonPos := INSTR(inFullPath, ':', 1, 1);
varMountPoint := SUBSTR(inFullPath, varColonPos+1, varPathPos-varColonPos);
-- the diskserver is before the ':'
varDiskServerName := SUBSTR(inFullPath, 1, varColonPos-1);
-- the fileid is between last / and '@'
varLastSlashPos := INSTR(inFullPath, '/', -1, 1);
varAtPos := INSTR(inFullPath, '@', 1, 1);
outFileId := TO_NUMBER(SUBSTR(inFullPath, varLastSlashPos+1, varAtPos-varLastSlashPos-1));
-- the nsHost is between '@' and last '.'
outNsHost := SUBSTR(inFullPath, varAtPos+1, varLastDotPos-varAtPos-1);
-- find out the filesystem Id
SELECT FileSystem.id INTO outFileSystem
FROM DiskServer, FileSystem
WHERE DiskServer.name = varDiskServerName
AND FileSystem.diskServer = DiskServer.id
AND FileSystem.mountPoint = varMountPoint;
END;
/
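-- Illustrative sketch, not part of the script: how parsePath decomposes a CASTOR
-- physical path of the form diskserver:mountPoint/nn/fileid@nsHost.dcId. The host
-- names, mount point and ids below are hypothetical, and the final lookup assumes
-- a matching DiskServer/FileSystem row exists.
DECLARE
  varFs     INTEGER;
  varPath   VARCHAR2(2048);
  varDcId   INTEGER;
  varFileId INTEGER;
  varNsHost VARCHAR2(2048);
BEGIN
  parsePath('lxfsd01.cern.ch:/srv/castor/01/56/123456@castorns.cern.ch.7890',
            varFs, varPath, varDcId, varFileId, varNsHost);
  -- expected: varPath   = '56/123456@castorns.cern.ch.7890'
  --           varDcId   = 7890, varFileId = 123456
  --           varNsHost = 'castorns.cern.ch'
  DBMS_OUTPUT.PUT_LINE('fileid=' || varFileId || '@' || varNsHost ||
                       ' dcId=' || varDcId || ' path=' || varPath);
END;
/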
/* Function to check if a diskserver and its given mountpoints have any files
* attached to them.
*/
......
......@@ -10,17 +10,16 @@
CREATE OR REPLACE PACKAGE castorDebug AS
TYPE DiskCopyDebug_typ IS RECORD (
id INTEGER,
status VARCHAR2(2048),
creationtime VARCHAR2(2048),
diskPool VARCHAR2(2048),
location VARCHAR2(2048),
available CHAR(1),
status NUMBER,
creationtime VARCHAR2(2048),
diskCopySize NUMBER,
castorFileSize NUMBER,
gcWeight NUMBER);
TYPE DiskCopyDebug IS TABLE OF DiskCopyDebug_typ;
TYPE SubRequestDebug IS TABLE OF SubRequest%ROWTYPE;
TYPE MigrationJobDebug IS TABLE OF MigrationJob%ROWTYPE;
TYPE RequestDebug_typ IS RECORD (
creationtime VARCHAR2(2048),
SubReqId NUMBER,
......@@ -33,22 +32,48 @@ CREATE OR REPLACE PACKAGE castorDebug AS
TYPE RequestDebug IS TABLE OF RequestDebug_typ;
TYPE RecallJobDebug_typ IS RECORD (
id INTEGER,
status VARCHAR2(2048),
creationtime VARCHAR2(2048),
fseq INTEGER,
copyNb INTEGER,
recallGroup VARCHAR(2048),
svcClass VARCHAR(2048),
euid INTEGER,
egid INTEGER,
vid VARCHAR(2048),
fseq INTEGER,
status INTEGER,
creationTime NUMBER,
nbRetriesWithinMount INTEGER,
nbMounts INTEGER);
TYPE RecallJobDebug IS TABLE OF RecallJobDebug_typ;
TYPE MigrationJobDebug_typ IS RECORD (
id INTEGER,
status VARCHAR2(2048),
creationTime VARCHAR2(2048),
fileSize INTEGER,
tapePoolName VARCHAR2(2048),
destCopyNb INTEGER,
fseq INTEGER,
mountTransactionId INTEGER,
originalVID VARCHAR2(2048),
originalCopyNb INTEGER,
nbRetries INTEGER,
fileTransactionId INTEGER);
TYPE MigrationJobDebug IS TABLE OF MigrationJobDebug_typ;
TYPE Disk2DiskCopyJobDebug_typ IS RECORD (
id INTEGER,
status VARCHAR2(2048),
creationTime VARCHAR2(2048),
transferId VARCHAR2(2048),
retryCounter INTEGER,
nsOpenTime INTEGER,
destSvcClassName VARCHAR2(2048),
replicationType VARCHAR2(2048),
replacedDCId INTEGER,
destDCId INTEGER,
drainingJob INTEGER);
TYPE Disk2DiskCopyJobDebug IS TABLE OF Disk2DiskCopyJobDebug_typ;
END;
/
/* Return the castor file id associated with the reference number */
CREATE OR REPLACE FUNCTION getCF(ref NUMBER) RETURN NUMBER AS
t NUMBER;
......@@ -72,9 +97,13 @@ EXCEPTION WHEN NO_DATA_FOUND THEN -- MigrationJob?
BEGIN
SELECT castorFile INTO cfId FROM MigrationJob WHERE id = ref;
RETURN cfId;
EXCEPTION WHEN NO_DATA_FOUND THEN -- Disk2DiskCopyJob?
BEGIN
SELECT castorFile INTO cfId FROM Disk2DiskCopyJob WHERE id = ref;
RETURN cfId;
EXCEPTION WHEN NO_DATA_FOUND THEN -- nothing found
RAISE_APPLICATION_ERROR (-20000, 'Could not find any CastorFile, SubRequest, DiskCopy, MigrationJob or RecallJob with id = ' || ref);
END; END; END; END; END;
RAISE_APPLICATION_ERROR (-20000, 'Could not find any CastorFile, SubRequest, DiskCopy, MigrationJob, RecallJob or Disk2DiskCopyJob with id = ' || ref);
END; END; END; END; END; END;
/
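-- Illustrative sketch, not part of the script: getCF resolves a CastorFile id
-- from the id of the CastorFile itself or of a related SubRequest, DiskCopy,
-- MigrationJob, RecallJob or Disk2DiskCopyJob; 1234 is a hypothetical id, and
-- the call raises error -20000 when nothing matches.
SELECT getCF(1234) FROM DUAL;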
/* Function to convert seconds into a time string using the format:
......@@ -95,12 +124,11 @@ END;
/* Get the diskcopys associated with the reference number */
CREATE OR REPLACE FUNCTION getDCs(ref number) RETURN castorDebug.DiskCopyDebug PIPELINED AS
BEGIN
FOR d IN (SELECT DiskCopy.id,
FOR d IN (SELECT DiskCopy.id, getObjStatusName('DiskCopy', 'status', DiskCopy.status) AS status,
getTimeString(DiskCopy.creationtime) AS creationtime,
DiskPool.name AS diskpool,
DiskServer.name || ':' || FileSystem.mountPoint || DiskCopy.path AS location,
decode(DiskServer.status, 2, 'N', decode(FileSystem.status, 2, 'N', 'Y')) AS available,
DiskCopy.status AS status,
getTimeString(DiskCopy.creationtime) AS creationtime,
DiskCopy.diskCopySize AS diskcopysize,
CastorFile.fileSize AS castorfilesize,
trunc(DiskCopy.gcWeight, 2) AS gcweight
......@@ -119,9 +147,10 @@ END;
/* Get the recalljobs associated with the reference number */
CREATE OR REPLACE FUNCTION getRJs(ref number) RETURN castorDebug.RecallJobDebug PIPELINED AS
BEGIN
FOR t IN (SELECT RecallJob.id, RecallJob.copyNb, RecallGroup.name as recallGroupName,
FOR t IN (SELECT RecallJob.id, getObjStatusName('RecallJob', 'status', RecallJob.status) as status,
getTimeString(RecallJob.creationTime) as creationTime,
RecallJob.fseq, RecallJob.copyNb, RecallGroup.name as recallGroupName,
SvcClass.name as svcClassName, RecallJob.euid, RecallJob.egid, RecallJob.vid,
RecallJob.fseq, RecallJob.status, RecallJob.creationTime,
RecallJob.nbRetriesWithinMount, RecallJob.nbMounts
FROM RecallJob, RecallGroup, SvcClass
WHERE RecallJob.castorfile = getCF(ref)
......@@ -136,9 +165,35 @@ END;
/* Get the migration jobs associated with the reference number */
CREATE OR REPLACE FUNCTION getMJs(ref number) RETURN castorDebug.MigrationJobDebug PIPELINED AS
BEGIN
FOR t IN (SELECT *
FROM MigrationJob
WHERE castorfile = getCF(ref)) LOOP
FOR t IN (SELECT MigrationJob.id, getObjStatusName('MigrationJob', 'status', MigrationJob.status) as status,
getTimeString(MigrationJob.creationTime) as creationTime,
MigrationJob.fileSize, TapePool.name as tapePoolName,
MigrationJob.destCopyNb, MigrationJob.fseq,
MigrationJob.mountTransactionId,
MigrationJob.originalVID, MigrationJob.originalCopyNb,
MigrationJob.nbRetries, MigrationJob.fileTransactionId
FROM MigrationJob, TapePool
WHERE castorfile = getCF(ref)
AND MigrationJob.tapePool = TapePool.id) LOOP
PIPE ROW(t);
END LOOP;
END;
/
/* Get the (disk2disk) copy jobs associated with the reference number */
CREATE OR REPLACE FUNCTION getCJs(ref number) RETURN castorDebug.Disk2DiskCopyJobDebug PIPELINED AS
BEGIN
FOR t IN (SELECT Disk2DiskCopyJob.id, getObjStatusName('Disk2DiskCopyJob', 'status', Disk2DiskCopyJob.status) as status,
getTimeString(Disk2DiskCopyJob.creationTime) as creationTime,
Disk2DiskCopyJob.transferId, Disk2DiskCopyJob.retryCounter,
Disk2DiskCopyJob.nsOpenTime, SvcClass.name as destSvcClassName,
getObjStatusName('Disk2DiskCopyJob', 'replicationType', Disk2DiskCopyJob.replicationType) as replicationType,
Disk2DiskCopyJob.replacedDCId, Disk2DiskCopyJob.destDCId,
Disk2DiskCopyJob.drainingJob
FROM Disk2DiskCopyJob, SvcClass
WHERE castorfile = getCF(ref)
AND Disk2DiskCopyJob.destSvcClass = SvcClass.id) LOOP
PIPE ROW(t);
END LOOP;
END;
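-- Illustrative sketch, not part of the script: the pipelined debug helpers are
-- meant to be queried as table functions; 1234 is a hypothetical id of a
-- CastorFile or of any entity that getCF can resolve.
SELECT * FROM TABLE(getDCs(1234));  -- disk copies of the file
SELECT * FROM TABLE(getMJs(1234));  -- its migration jobs
SELECT * FROM TABLE(getCJs(1234));  -- its disk-to-disk copy jobs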
......
......@@ -20,47 +20,65 @@ END;
/* handle the creation of the Disk2DiskCopyJobs for the running drainingJobs */
CREATE OR REPLACE PROCEDURE drainRunner AS
varNbFiles INTEGER := 0;
varNbBytes INTEGER := 0;
varNbFiles INTEGER;
varNbBytes INTEGER;
varNbRunningJobs INTEGER;
varMaxNbOfSchedD2dPerDrain INTEGER;
varUnused INTEGER;
BEGIN
-- get maxNbOfSchedD2dPerDrain
varMaxNbOfSchedD2dPerDrain := TO_NUMBER(getConfigOption('Draining', 'MaxNbSchedD2dPerDrain', '1000'));
-- loop over draining jobs
FOR dj IN (SELECT id, fileSystem, svcClass, fileMask, euid, egid
FROM DrainingJob WHERE status = dconst.DRAININGJOB_RUNNING) LOOP
-- check how many disk2DiskCopyJobs are already running for this draining job
SELECT count(*) INTO varNbRunningJobs FROM Disk2DiskCopyJob WHERE drainingJob = dj.id;
-- Loop over the creation of Disk2DiskCopyJobs. Select max 1000 files, taking running
-- ones into account. Also take the most important jobs first
logToDLF(NULL, dlf.LVL_SYSTEM, dlf.DRAINING_REFILL, 0, '', 'stagerd',
'svcClass=' || getSvcClassName(dj.svcClass) || ' DrainReq=' ||
TO_CHAR(dj.id) || ' MaxNewJobsCount=' || TO_CHAR(varMaxNbOfSchedD2dPerDrain-varNbRunningJobs));
FOR F IN (SELECT * FROM
(SELECT CastorFile.id cfId, Castorfile.nsOpenTime, DiskCopy.id dcId, CastorFile.fileSize
FROM DiskCopy, CastorFile
WHERE DiskCopy.fileSystem = dj.fileSystem
AND CastorFile.id = DiskCopy.castorFile
AND ((dj.fileMask = dconst.DRAIN_FILEMASK_NOTONTAPE AND
CastorFile.tapeStatus IN (dconst.CASTORFILE_NOTONTAPE, dconst.CASTORFILE_DISKONLY)) OR
(dj.fileMask = dconst.DRAIN_FILEMASK_ALL))
AND DiskCopy.status = dconst.DISKCOPY_VALID
AND NOT EXISTS (SELECT 1 FROM Disk2DiskCopyJob WHERE castorFile=CastorFile.id)
ORDER BY DiskCopy.importance DESC)
WHERE ROWNUM <= varMaxNbOfSchedD2dPerDrain-varNbRunningJobs) LOOP
createDisk2DiskCopyJob(F.cfId, F.nsOpenTime, dj.svcClass, dj.euid, dj.egid,
dconst.REPLICATIONTYPE_DRAINING, F.dcId, dj.id);
varNbFiles := varNbFiles + 1;
varNbBytes := varNbBytes + F.fileSize;
END LOOP;
-- commit and update counters
UPDATE DrainingJob
SET totalFiles = totalFiles + varNbFiles,
totalBytes = totalBytes + varNbBytes,
lastModificationTime = getTime()
WHERE id = dj.id;
COMMIT;
DECLARE
CONSTRAINT_VIOLATED EXCEPTION;
PRAGMA EXCEPTION_INIT(CONSTRAINT_VIOLATED, -1);
BEGIN
-- lock the draining Job first
SELECT id INTO varUnused FROM DrainingJob WHERE id = dj.id FOR UPDATE;
-- check how many disk2DiskCopyJobs are already running for this draining job
SELECT count(*) INTO varNbRunningJobs FROM Disk2DiskCopyJob WHERE drainingJob = dj.id;
-- Loop over the creation of Disk2DiskCopyJobs. Select max 1000 files, taking running
-- ones into account. Also take the most important jobs first
logToDLF(NULL, dlf.LVL_SYSTEM, dlf.DRAINING_REFILL, 0, '', 'stagerd',
'svcClass=' || getSvcClassName(dj.svcClass) || ' DrainReq=' ||
TO_CHAR(dj.id) || ' MaxNewJobsCount=' || TO_CHAR(varMaxNbOfSchedD2dPerDrain-varNbRunningJobs));
varNbFiles := 0;
varNbBytes := 0;
FOR F IN (SELECT * FROM
(SELECT CastorFile.id cfId, Castorfile.nsOpenTime, DiskCopy.id dcId, CastorFile.fileSize
FROM DiskCopy, CastorFile
WHERE DiskCopy.fileSystem = dj.fileSystem
AND CastorFile.id = DiskCopy.castorFile
AND ((dj.fileMask = dconst.DRAIN_FILEMASK_NOTONTAPE AND
CastorFile.tapeStatus IN (dconst.CASTORFILE_NOTONTAPE, dconst.CASTORFILE_DISKONLY)) OR
(dj.fileMask = dconst.DRAIN_FILEMASK_ALL))
AND DiskCopy.status = dconst.DISKCOPY_VALID
AND NOT EXISTS (SELECT 1 FROM Disk2DiskCopyJob WHERE castorFile=CastorFile.id)
ORDER BY DiskCopy.importance DESC)
WHERE ROWNUM <= varMaxNbOfSchedD2dPerDrain-varNbRunningJobs) LOOP
createDisk2DiskCopyJob(F.cfId, F.nsOpenTime, dj.svcClass, dj.euid, dj.egid,
dconst.REPLICATIONTYPE_DRAINING, F.dcId, dj.id, FALSE);
varNbFiles := varNbFiles + 1;
varNbBytes := varNbBytes + F.fileSize;
END LOOP;
-- commit and update counters
UPDATE DrainingJob
SET totalFiles = totalFiles + varNbFiles,
totalBytes = totalBytes + varNbBytes,
lastModificationTime = getTime()
WHERE id = dj.id;
COMMIT;
EXCEPTION WHEN CONSTRAINT_VIOLATED THEN
-- check that the constraint violated is due to deletion of the drainingJob
IF sqlerrm LIKE '%constraint (CASTOR_STAGER.FK_DISK2DISKCOPYJOB_DRAINJOB) violated%' THEN
-- give up with this DrainingJob as it was canceled
ROLLBACK;
ELSE
raise;
END IF;
END;
END LOOP;
END;
/
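-- Illustrative sketch, not part of the script: the per-job counters maintained
-- by drainRunner can be followed with a plain query (the columns are those used
-- in the UPDATE above).
SELECT id, totalFiles, totalBytes, lastModificationTime
  FROM DrainingJob;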
......@@ -131,7 +149,7 @@ BEGIN
-- create disk2DiskCopyJob for this diskCopy
createDisk2DiskCopyJob(varCfId, varNsOpenTime, inDestSvcClassId,
0, 0, dconst.REPLICATIONTYPE_REBALANCE,
varDcId, NULL);
varDcId, NULL, FALSE);
END LOOP;
CLOSE DCcur;
-- "rebalancing : stopping" message
......
......@@ -418,6 +418,9 @@ BEGIN
FOR cf IN (SELECT cfId, dcId
FROM filesDeletedProcHelper
ORDER BY cfId ASC) LOOP
DECLARE
CONSTRAINT_VIOLATED EXCEPTION;
PRAGMA EXCEPTION_INIT(CONSTRAINT_VIOLATED, -1);
BEGIN
-- Get data and lock the castorFile
SELECT fileId, nsHost, fileClass
......@@ -476,6 +479,15 @@ BEGIN
-- There is thus no way to find out whether to remove the
-- file from the nameserver. For safety, we thus keep it
NULL;
WHEN CONSTRAINT_VIOLATED THEN
IF sqlerrm LIKE '%constraint (CASTOR_STAGER.FK_DISK2DISKCOPYJOB_CASTORFILE) violated%' THEN
-- Ignore the deletion, probably some draining/rebalancing activity created a Disk2DiskCopyJob entity
-- while we were attempting to drop the CastorFile
NULL;