oracleGC.sql 30.2 KB
Newer Older
1
/*******************************************************************
2
3
 *
 *
4
 * PL/SQL code for stager cleanup and garbage collecting
5
6
7
 *
 * @author Castor Dev team, castor-dev@cern.ch
 *******************************************************************/
8

9
10
11
12
13
/* PL/SQL declaration for the castorGC package */
CREATE OR REPLACE PACKAGE castorGC AS
  TYPE SelectFiles2DeleteLine IS RECORD (
        path VARCHAR2(2048),
        id NUMBER,
14
        fileId NUMBER,
15
16
17
18
        nsHost VARCHAR2(2048),
        lastAccessTime INTEGER,
        nbAccesses NUMBER,
        gcWeight NUMBER,
19
20
        gcTriggeredBy VARCHAR2(2048),
        svcClassName VARCHAR2(2048));
21
  TYPE SelectFiles2DeleteLine_Cur IS REF CURSOR RETURN SelectFiles2DeleteLine;
22
23
24
25
26
27
28
  -- find out a gc function to be used from a given serviceClass
  FUNCTION getUserWeight(svcClassId NUMBER) RETURN VARCHAR2;
  FUNCTION getRecallWeight(svcClassId NUMBER) RETURN VARCHAR2;
  FUNCTION getCopyWeight(svcClassId NUMBER) RETURN VARCHAR2;
  FUNCTION getFirstAccessHook(svcClassId NUMBER) RETURN VARCHAR2;
  FUNCTION getAccessHook(svcClassId NUMBER) RETURN VARCHAR2;
  FUNCTION getUserSetGCWeight(svcClassId NUMBER) RETURN VARCHAR2;
29
30
31
  -- compute gcWeight from size
  FUNCTION size2GCWeight(s NUMBER) RETURN NUMBER;
  -- Default gc policy
32
  FUNCTION sizeRelatedUserWeight(fileSize NUMBER) RETURN NUMBER;
33
  FUNCTION sizeRelatedRecallWeight(fileSize NUMBER) RETURN NUMBER;
34
  FUNCTION sizeRelatedCopyWeight(fileSize NUMBER) RETURN NUMBER;
35
36
37
38
  FUNCTION dayBonusFirstAccessHook(oldGcWeight NUMBER, creationTime NUMBER) RETURN NUMBER;
  FUNCTION halfHourBonusAccessHook(oldGcWeight NUMBER, creationTime NUMBER, nbAccesses NUMBER) RETURN NUMBER;
  FUNCTION cappedUserSetGCWeight(oldGcWeight NUMBER, userDelta NUMBER) RETURN NUMBER;
  -- FIFO gc policy
39
  FUNCTION creationTimeUserWeight(fileSize NUMBER) RETURN NUMBER;
40
  FUNCTION creationTimeRecallWeight(fileSize NUMBER) RETURN NUMBER;
41
  FUNCTION creationTimeCopyWeight(fileSize NUMBER) RETURN NUMBER;
42
43
44
  -- LRU gc policy
  FUNCTION LRUFirstAccessHook(oldGcWeight NUMBER, creationTime NUMBER) RETURN NUMBER;
  FUNCTION LRUAccessHook(oldGcWeight NUMBER, creationTime NUMBER, nbAccesses NUMBER) RETURN NUMBER;
45
  FUNCTION LRUpinUserSetGCWeight(oldGcWeight NUMBER, userDelta NUMBER) RETURN NUMBER;
46
END castorGC;
Dennis Waldron's avatar
Dennis Waldron committed
47
/
48

49
50
CREATE OR REPLACE PACKAGE BODY castorGC AS

51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
  FUNCTION getUserWeight(svcClassId NUMBER) RETURN VARCHAR2 AS
    ret VARCHAR2(2048);
  BEGIN
    SELECT userWeight INTO ret
      FROM SvcClass, GcPolicy
     WHERE SvcClass.id = svcClassId
       AND SvcClass.gcPolicy = GcPolicy.name;
    RETURN ret;
  EXCEPTION WHEN NO_DATA_FOUND THEN
    -- we did not get any policy, let's go for the default
    SELECT userWeight INTO ret
      FROM GcPolicy
     WHERE GcPolicy.name = 'default';
    RETURN ret;
  END;

  FUNCTION getRecallWeight(svcClassId NUMBER) RETURN VARCHAR2 AS
    ret VARCHAR2(2048);
  BEGIN
    SELECT recallWeight INTO ret
      FROM SvcClass, GcPolicy
     WHERE SvcClass.id = svcClassId
       AND SvcClass.gcPolicy = GcPolicy.name;
    RETURN ret;
  EXCEPTION WHEN NO_DATA_FOUND THEN
    -- we did not get any policy, let's go for the default
    SELECT recallWeight INTO ret
      FROM GcPolicy
     WHERE GcPolicy.name = 'default';
    RETURN ret;
  END;

  FUNCTION getCopyWeight(svcClassId NUMBER) RETURN VARCHAR2 AS
    ret VARCHAR2(2048);
  BEGIN
    SELECT copyWeight INTO ret
      FROM SvcClass, GcPolicy
     WHERE SvcClass.id = svcClassId
       AND SvcClass.gcPolicy = GcPolicy.name;
    RETURN ret;
  EXCEPTION WHEN NO_DATA_FOUND THEN
    -- we did not get any policy, let's go for the default
    SELECT copyWeight INTO ret
      FROM GcPolicy
     WHERE GcPolicy.name = 'default';
    RETURN ret;
  END;

  FUNCTION getFirstAccessHook(svcClassId NUMBER) RETURN VARCHAR2 AS
    ret VARCHAR2(2048);
  BEGIN
    SELECT firstAccessHook INTO ret
      FROM SvcClass, GcPolicy
     WHERE SvcClass.id = svcClassId
       AND SvcClass.gcPolicy = GcPolicy.name;
    RETURN ret;
  EXCEPTION WHEN NO_DATA_FOUND THEN
    RETURN NULL;
  END;

  FUNCTION getAccessHook(svcClassId NUMBER) RETURN VARCHAR2 AS
    ret VARCHAR2(2048);
  BEGIN
    SELECT accessHook INTO ret
      FROM SvcClass, GcPolicy
     WHERE SvcClass.id = svcClassId
       AND SvcClass.gcPolicy = GcPolicy.name;
    RETURN ret;
  EXCEPTION WHEN NO_DATA_FOUND THEN
    RETURN NULL;
  END;

  FUNCTION getUserSetGCWeight(svcClassId NUMBER) RETURN VARCHAR2 AS
    ret VARCHAR2(2048);
  BEGIN
    SELECT userSetGCWeight INTO ret
      FROM SvcClass, GcPolicy
     WHERE SvcClass.id = svcClassId
       AND SvcClass.gcPolicy = GcPolicy.name;
    RETURN ret;
  EXCEPTION WHEN NO_DATA_FOUND THEN
    RETURN NULL;
  END;

  FUNCTION size2GCWeight(s NUMBER) RETURN NUMBER IS
136
137
138
139
140
141
142
143
  BEGIN
    IF s < 1073741824 THEN
      RETURN 1073741824/(s+1)*86400 + getTime();  -- 1GB/filesize (days) + current time as lastAccessTime
    ELSE
      RETURN 86400 + getTime();  -- the value for 1G file. We do not make any difference for big files and privilege FIFO
    END IF;
  END;

144
  FUNCTION sizeRelatedUserWeight(fileSize NUMBER) RETURN NUMBER AS
145
146
147
148
  BEGIN
    RETURN size2GCWeight(fileSize);
  END;

149
  FUNCTION sizeRelatedRecallWeight(fileSize NUMBER) RETURN NUMBER AS
150
151
152
153
  BEGIN
    RETURN size2GCWeight(fileSize);
  END;

154
  FUNCTION sizeRelatedCopyWeight(fileSize NUMBER) RETURN NUMBER AS
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
  BEGIN
    RETURN size2GCWeight(fileSize);
  END;

  FUNCTION dayBonusFirstAccessHook(oldGcWeight NUMBER, creationTime NUMBER) RETURN NUMBER AS
  BEGIN
    RETURN oldGcWeight - 86400;
  END;

  FUNCTION halfHourBonusAccessHook(oldGcWeight NUMBER, creationTime NUMBER, nbAccesses NUMBER) RETURN NUMBER AS
  BEGIN
    RETURN oldGcWeight + 1800;
  END;

  FUNCTION cappedUserSetGCWeight(oldGcWeight NUMBER, userDelta NUMBER) RETURN NUMBER AS
  BEGIN
    IF userDelta >= 18000 THEN -- 5h max
      RETURN oldGcWeight + 18000;
    ELSE
      RETURN oldGcWeight + userDelta;
    END IF;
  END;

  -- FIFO gc policy
179
  FUNCTION creationTimeUserWeight(fileSize NUMBER) RETURN NUMBER AS
180
181
182
183
  BEGIN
    RETURN getTime();
  END;

184
  FUNCTION creationTimeRecallWeight(fileSize NUMBER) RETURN NUMBER AS
185
186
187
188
  BEGIN
    RETURN getTime();
  END;

189
  FUNCTION creationTimeCopyWeight(fileSize NUMBER) RETURN NUMBER AS
190
191
192
193
  BEGIN
    RETURN getTime();
  END;

194
  -- LRU and LRUpin gc policy
195
196
197
198
199
200
201
202
203
204
  FUNCTION LRUFirstAccessHook(oldGcWeight NUMBER, creationTime NUMBER) RETURN NUMBER AS
  BEGIN
    RETURN getTime();
  END;

  FUNCTION LRUAccessHook(oldGcWeight NUMBER, creationTime NUMBER, nbAccesses NUMBER) RETURN NUMBER AS
  BEGIN
    RETURN getTime();
  END;

205
206
207
208
209
210
211
212
213
  FUNCTION LRUpinUserSetGCWeight(oldGcWeight NUMBER, userDelta NUMBER) RETURN NUMBER AS
  BEGIN
    IF userDelta >= 2592000 THEN -- 30 days max
      RETURN oldGcWeight + 2592000;
    ELSE
      RETURN oldGcWeight + userDelta;
    END IF;
  END;

214
END castorGC;
Dennis Waldron's avatar
Dennis Waldron committed
215
/
216

217
/* PL/SQL method implementing selectFiles2Delete
218
219
   This is the standard garbage collector: it sorts VALID diskcopies
   that do not need to go to tape by gcWeight and selects them for deletion up to
220
   the desired free space watermark */
221
222
223
CREATE OR REPLACE
PROCEDURE selectFiles2Delete(diskServerName IN VARCHAR2,
                             files OUT castorGC.SelectFiles2DeleteLine_Cur) AS
224
225
226
227
228
  dcIds "numList";
  freed INTEGER;
  deltaFree INTEGER;
  toBeFreed INTEGER;
  dontGC INTEGER;
229
  totalCount INTEGER;
230
  unused INTEGER;
231
  backoff INTEGER;
232
233
  CastorFileLocked EXCEPTION;
  PRAGMA EXCEPTION_INIT (CastorFileLocked, -54);
234
BEGIN
235
  -- First of all, check if we are in a Disk1 pool
236
  dontGC := 0;
237
  FOR sc IN (SELECT disk1Behavior
238
               FROM SvcClass, DiskPool2SvcClass D2S, DiskServer, FileSystem
239
              WHERE SvcClass.id = D2S.child
240
241
                AND D2S.parent = FileSystem.diskPool
                AND FileSystem.diskServer = DiskServer.id
242
                AND DiskServer.name = diskServerName) LOOP
243
    -- If any of the service classes to which we belong (normally a single one)
244
    -- say this is Disk1, we don't GC files.
245
    IF sc.disk1Behavior = 1 THEN
246
247
248
249
      dontGC := 1;
      EXIT;
    END IF;
  END LOOP;
250
251
252

  -- Loop on all concerned fileSystems in a random order.
  totalCount := 0;
253
254
255
  FOR fs IN (SELECT FileSystem.id
               FROM FileSystem, DiskServer
              WHERE FileSystem.diskServer = DiskServer.id
256
257
                AND DiskServer.name = diskServerName
             ORDER BY dbms_random.value) LOOP
258

259
260
261
    -- Count the number of diskcopies on this filesystem that are in a
    -- BEINGDELETED state. These need to be reselected in any case.
    freed := 0;
262
    SELECT totalCount + count(*), nvl(sum(DiskCopy.diskCopySize), 0)
263
      INTO totalCount, freed
264
265
      FROM DiskCopy
     WHERE DiskCopy.fileSystem = fs.id
266
       AND decode(status, 9, status, NULL) = 9;  -- BEINGDELETED (decode used to use function-based index)
267

268
    -- estimate the number of GC running the "long" query, that is the one dealing with the GCing of
269
    -- VALID files.
270
271
272
273
    SELECT COUNT(*) INTO backoff
      FROM v$session s, v$sqltext t
     WHERE s.sql_id = t.sql_id AND t.sql_text LIKE '%I_DiskCopy_FS_GCW%';

274
    -- Process diskcopies that are in an INVALID state.
275
    UPDATE /*+ INDEX_RS_ASC(DiskCopy I_DiskCopy_Status_7_FS)) */ DiskCopy
276
       SET status = 9, -- BEINGDELETED
277
           gcType = decode(gcType, NULL, dconst.GCTYPE_USER, gcType)
278
     WHERE fileSystem = fs.id
279
       AND decode(status, 7, status, NULL) = 7  -- INVALID (decode used to use function-based index)
280
       AND rownum <= 10000 - totalCount
281
282
    RETURNING id BULK COLLECT INTO dcIds;
    COMMIT;
283

284
    -- If we have more than 10,000 files to GC, exit the loop. There is no point
285
286
    -- processing more as the maximum sent back to the client in one call is
    -- 10,000. This protects the garbage collector from being overwhelmed with
287
288
289
    -- requests and reduces the stager DB load. Furthermore, if too much data is
    -- sent back to the client, the transfer time between the stager and client
    -- becomes very long and the message may timeout or may not even fit in the
290
    -- clients receive buffer!
291
292
293
    totalCount := totalCount + dcIds.COUNT();
    EXIT WHEN totalCount >= 10000;

294
    -- Continue processing but with VALID files, only in case we are not already loaded
295
    IF dontGC = 0 AND backoff < 4 THEN
296
      -- Do not delete VALID files from non production hardware
297
298
299
300
      BEGIN
        SELECT FileSystem.id INTO unused
          FROM DiskServer, FileSystem
         WHERE FileSystem.id = fs.id
301
           AND FileSystem.status IN (dconst.FILESYSTEM_PRODUCTION, dconst.FILESYSTEM_READONLY)
302
           AND FileSystem.diskserver = DiskServer.id
303
304
           AND DiskServer.status IN (dconst.DISKSERVER_PRODUCTION, dconst.DISKSERVER_READONLY)
           AND DiskServer.hwOnline = 1;
305
306
307
      EXCEPTION WHEN NO_DATA_FOUND THEN
        EXIT;
      END;
308
309
      -- Calculate the amount of space that would be freed on the filesystem
      -- if the files selected above were to be deleted.
310
      IF dcIds.COUNT > 0 THEN
311
        SELECT /*+ INDEX(DiskCopy PK_DiskCopy_Id) */ freed + sum(diskCopySize) INTO freed
312
          FROM DiskCopy
313
314
         WHERE DiskCopy.id IN
             (SELECT /*+ CARDINALITY(fsidTable 5) */ *
315
                FROM TABLE(dcIds) dcidTable);
316
      END IF;
317
318
319
320
321
322
      -- Get the amount of space to be liberated
      SELECT decode(sign(maxFreeSpace * totalSize - free), -1, 0, maxFreeSpace * totalSize - free)
        INTO toBeFreed
        FROM FileSystem
       WHERE id = fs.id;
      -- If space is still required even after removal of INVALID files, consider
323
      -- removing VALID files until we are below the free space watermark
324
      IF freed < toBeFreed THEN
325
        -- Loop on file deletions
326
        FOR dc IN (SELECT /*+ INDEX_RS_ASC(DiskCopy I_DiskCopy_FS_GCW) */ DiskCopy.id, castorFile
327
                     FROM DiskCopy, CastorFile
328
                    WHERE fileSystem = fs.id
329
330
331
                      AND status = dconst.DISKCOPY_VALID
                      AND CastorFile.id = DiskCopy.castorFile
                      AND CastorFile.tapeStatus IN (dconst.CASTORFILE_DISKONLY, dconst.CASTORFILE_ONTAPE)
332
333
334
335
336
337
338
                      ORDER BY gcWeight ASC) LOOP
          BEGIN
            -- Lock the CastorFile
            SELECT id INTO unused FROM CastorFile
             WHERE id = dc.castorFile FOR UPDATE NOWAIT;
            -- Mark the DiskCopy as being deleted
            UPDATE DiskCopy
339
340
               SET status = dconst.DISKCOPY_BEINGDELETED,
                   gcType = dconst.GCTYPE_AUTO
341
342
343
344
             WHERE id = dc.id RETURNING diskCopySize INTO deltaFree;
            totalCount := totalCount + 1;
            -- Update freed space
            freed := freed + deltaFree;
345
346
347
348
349
            -- update importance of remianing copies of the file if any
            UPDATE DiskCopy
               SET importance = importance + 1
             WHERE castorFile = dc.castorFile
               AND status = dconst.DISKCOPY_VALID;
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
            -- Shall we continue ?
            IF toBeFreed <= freed THEN
              EXIT;
            END IF;
            IF totalCount >= 10000 THEN
              EXIT;
            END IF;           
          EXCEPTION
            WHEN NO_DATA_FOUND THEN
              -- The file no longer exists or has the wrong state
              NULL;
            WHEN CastorFileLocked THEN
              -- Go to the next candidate, processing is taking place on the
              -- file
              NULL;
          END;
366
          COMMIT;
367
368
369
        END LOOP;
      END IF;
    END IF;
370
371
    -- We have enough files to exit the loop ?
    EXIT WHEN totalCount >= 10000;
372
  END LOOP;
373

374
  -- Now select all the BEINGDELETED diskcopies in this diskserver for the GC daemon
375
  OPEN files FOR
376
    SELECT /*+ INDEX(CastorFile PK_CastorFile_ID) */ FileSystem.mountPoint || DiskCopy.path,
377
378
379
           DiskCopy.id,
           Castorfile.fileid, Castorfile.nshost,
           DiskCopy.lastAccessTime, DiskCopy.nbCopyAccesses, DiskCopy.gcWeight,
380
           getObjStatusName('DiskCopy', 'gcType', DiskCopy.gcType),
Giuseppe Lo Presti's avatar
Giuseppe Lo Presti committed
381
           getSvcClassList(FileSystem.id)
382
383
384
385
386
387
388
      FROM CastorFile, DiskCopy, FileSystem, DiskServer
     WHERE decode(DiskCopy.status, 9, DiskCopy.status, NULL) = 9 -- BEINGDELETED
       AND DiskCopy.castorfile = CastorFile.id
       AND DiskCopy.fileSystem = FileSystem.id
       AND FileSystem.diskServer = DiskServer.id
       AND DiskServer.name = diskServerName
       AND rownum <= 10000;
389
END;
Dennis Waldron's avatar
Dennis Waldron committed
390
/
391

392

393
394
395
396
397
398
399
400
401
402
403
404
/*
 * PL/SQL method implementing filesDeleted
 * Note that we don't increase the freespace of the fileSystem.
 * This is done by the monitoring daemon, that knows the
 * exact amount of free space.
 * dcIds gives the list of diskcopies to delete.
 * fileIds returns the list of castor files to be removed
 * from the name server
 */
CREATE OR REPLACE PROCEDURE filesDeletedProc
(dcIds IN castor."cnumList",
 fileIds OUT castor.FileList_Cur) AS
405
406
407
408
  fid NUMBER;
  fc NUMBER;
  nsh VARCHAR2(2048);
  nb INTEGER;
409
410
BEGIN
  IF dcIds.COUNT > 0 THEN
411
    -- List the castorfiles to be cleaned up afterwards
412
    FORALL i IN dcIds.FIRST .. dcIds.LAST
413
414
      INSERT INTO FilesDeletedProcHelper (cfId, dcId) (
        SELECT castorFile, id FROM DiskCopy
415
         WHERE id = dcIds(i));
416
    -- Use a normal loop to clean castorFiles. Note: We order the list to
417
    -- prevent a deadlock
418
    FOR cf IN (SELECT cfId, dcId
419
420
                 FROM filesDeletedProcHelper
                ORDER BY cfId ASC) LOOP
421
422
423
      DECLARE
        CONSTRAINT_VIOLATED EXCEPTION;
        PRAGMA EXCEPTION_INIT(CONSTRAINT_VIOLATED, -1);
424
      BEGIN
425
426
427
428
        -- Get data and lock the castorFile
        SELECT fileId, nsHost, fileClass
          INTO fid, nsh, fc
          FROM CastorFile
429
         WHERE id = cf.cfId FOR UPDATE;
430
431
        -- delete the original diskcopy to be dropped
        DELETE FROM DiskCopy WHERE id = cf.dcId;
432
        -- Cleanup:
433
434
        -- See whether it has any other DiskCopy or any new Recall request:
        -- if so, skip the rest
435
436
        SELECT count(*) INTO nb FROM DiskCopy
         WHERE castorFile = cf.cfId;
437
438
439
        IF nb > 0 THEN
          CONTINUE;
        END IF;
440
        SELECT /*+ INDEX_RS_ASC(Subrequest I_Subrequest_Castorfile)*/ count(*) INTO nb
441
442
443
444
445
446
          FROM SubRequest
         WHERE castorFile = cf.cfId
           AND status = dconst.SUBREQUEST_WAITTAPERECALL;
        IF nb > 0 THEN
          CONTINUE;
        END IF;
447
448
449
450
451
452
        -- Now check for Disk2DiskCopy jobs
        SELECT /*+ INDEX(I_Disk2DiskCopyJob_cfId) */ count(*) INTO nb FROM Disk2DiskCopyJob
         WHERE castorFile = cf.cfId;
        IF nb > 0 THEN
          CONTINUE;
        END IF;
453
        -- Nothing found, check for any other subrequests
454
        SELECT /*+ INDEX_RS_ASC(Subrequest I_Subrequest_Castorfile)*/ count(*) INTO nb
455
456
          FROM SubRequest
         WHERE castorFile = cf.cfId
457
           AND status IN (1, 2, 3, 4, 5, 6, 7, 10, 12, 13);  -- all but START, FINISHED, FAILED_FINISHED, ARCHIVED
458
        IF nb = 0 THEN
459
460
461
462
          -- Nothing left, delete the CastorFile
          DELETE FROM CastorFile WHERE id = cf.cfId;
        ELSE
          -- Fail existing subrequests for this file
463
          UPDATE /*+ INDEX_RS_ASC(Subrequest I_Subrequest_Castorfile)*/ SubRequest
464
             SET status = dconst.SUBREQUEST_FAILED
465
           WHERE castorFile = cf.cfId
466
             AND status IN (1, 2, 3, 4, 5, 6, 12, 13);  -- same as above
467
468
469
470
471
472
473
474
475
        END IF;
        -- Check whether this file potentially had copies on tape
        SELECT nbCopies INTO nb FROM FileClass WHERE id = fc;
        IF nb = 0 THEN
          -- This castorfile was created with no copy on tape
          -- So removing it from the stager means erasing
          -- it completely. We should thus also remove it
          -- from the name server
          INSERT INTO FilesDeletedProcOutput (fileId, nsHost) VALUES (fid, nsh);
476
        END IF;
477
478
479
480
      EXCEPTION WHEN NO_DATA_FOUND THEN
        -- Ignore, this means that the castorFile did not exist.
        -- There is thus no way to find out whether to remove the
        -- file from the nameserver. For safety, we thus keep it
481
        NULL;
482
      WHEN CONSTRAINT_VIOLATED THEN
483
484
485
        IF sqlerrm LIKE '%constraint (CASTOR_STAGER.FK_%_CASTORFILE) violated%' THEN
          -- Ignore the deletion, probably some draining/rebalancing/recall activity created
	  -- a new Disk2DiskCopyJob/RecallJob entity while we were attempting to drop the CastorFile
486
487
488
489
490
          NULL;
        ELSE
          -- Any other constraint violation is an error
          RAISE;
        END IF;
491
      END;
492
    END LOOP;
493
  END IF;
494
495
  OPEN fileIds FOR
    SELECT fileId, nsHost FROM FilesDeletedProcOutput;
496
END;
Dennis Waldron's avatar
Dennis Waldron committed
497
/
498
499
500
501
502
503
504
505
506
507
508
509
510

/* PL/SQL method implementing filesDeletionFailedProc */
CREATE OR REPLACE PROCEDURE filesDeletionFailedProc
(dcIds IN castor."cnumList") AS
  cfId NUMBER;
BEGIN
  IF dcIds.COUNT > 0 THEN
    -- Loop over the files
    FORALL i IN dcIds.FIRST .. dcIds.LAST
      UPDATE DiskCopy SET status = 4 -- FAILED
       WHERE id = dcIds(i);
  END IF;
END;
Dennis Waldron's avatar
Dennis Waldron committed
511
/
512
513
514
515
516



/* PL/SQL method implementing nsFilesDeletedProc */
CREATE OR REPLACE PROCEDURE nsFilesDeletedProc
517
(nh IN VARCHAR2,
518
519
520
 fileIds IN castor."cnumList",
 orphans OUT castor.IdRecord_Cur) AS
  unused NUMBER;
521
  nsHostName VARCHAR2(2048);
522
523
524
525
BEGIN
  IF fileIds.COUNT <= 0 THEN
    RETURN;
  END IF;
526
527
  -- Get the stager/nsHost configuration option
  nsHostName := getConfigOption('stager', 'nsHost', nh);
528
529
530
531
532
  -- Loop over the deleted files and split the orphan ones
  -- from the normal ones
  FOR fid in fileIds.FIRST .. fileIds.LAST LOOP
    BEGIN
      SELECT id INTO unused FROM CastorFile
533
       WHERE fileid = fileIds(fid) AND nsHost = nsHostName;
534
      stageForcedRm(fileIds(fid), nsHostName, dconst.GCTYPE_NSSYNCH);
535
536
537
538
    EXCEPTION WHEN NO_DATA_FOUND THEN
      -- this file was dropped from nameServer AND stager
      -- and still exists on disk. We put it into the list
      -- of orphan fileids to return
539
      INSERT INTO NsFilesDeletedOrphans (fileId) VALUES (fileIds(fid));
540
541
542
543
544
    END;
  END LOOP;
  -- return orphan ones
  OPEN orphans FOR SELECT * FROM NsFilesDeletedOrphans;
END;
Dennis Waldron's avatar
Dennis Waldron committed
545
/
546
547
548
549
550
551
552
553
554
555
556
557
558


/* PL/SQL method implementing stgFilesDeletedProc */
CREATE OR REPLACE PROCEDURE stgFilesDeletedProc
(dcIds IN castor."cnumList",
 stgOrphans OUT castor.IdRecord_Cur) AS
  unused NUMBER;
BEGIN
  -- Nothing to do
  IF dcIds.COUNT <= 0 THEN
    RETURN;
  END IF;
  -- Insert diskcopy ids into a temporary table
559
  FORALL i IN dcIds.FIRST..dcIds.LAST
560
   INSERT INTO StgFilesDeletedOrphans (diskCopyId) VALUES (dcIds(i));
561
562
563
564
  -- Return a list of diskcopy ids which no longer exist
  OPEN stgOrphans FOR
    SELECT diskCopyId FROM StgFilesDeletedOrphans
     WHERE NOT EXISTS (
565
        SELECT /*+ INDEX(DiskCopy PK_DiskCopy_Id) */ 'x' FROM DiskCopy
566
567
         WHERE id = diskCopyId);
END;
Dennis Waldron's avatar
Dennis Waldron committed
568
/
569
570
571
572


/** Cleanup job **/

573
574
/* A little generic method to delete efficiently */
CREATE OR REPLACE PROCEDURE bulkDelete(sel IN VARCHAR2, tab IN VARCHAR2) AS
575
BEGIN
576
577
  EXECUTE IMMEDIATE
  'DECLARE
578
    CURSOR s IS '||sel||'
579
580
    ids "numList";
  BEGIN
581
    LOOP
582
583
      OPEN s;
      FETCH s BULK COLLECT INTO ids LIMIT 100000;
584
      EXIT WHEN ids.count = 0;
585
      FORALL i IN 1 .. ids.COUNT
586
        DELETE FROM '||tab||' WHERE id = ids(i);
587
      CLOSE s;
588
      COMMIT;
589
    END LOOP;
590
591
  END;';
END;
Dennis Waldron's avatar
Dennis Waldron committed
592
/
593

594
/* A generic method to delete requests of a given type */
595
CREATE OR REPLACE Procedure bulkDeleteRequests(reqType IN VARCHAR, timeOut IN INTEGER) AS
596
BEGIN
597
  bulkDelete('SELECT id FROM '|| reqType ||' R WHERE
598
599
    NOT EXISTS (SELECT 1 FROM SubRequest WHERE request = R.id) AND lastModificationTime < getTime() - '
    || timeOut ||';',
600
    reqType);
601
END;
Dennis Waldron's avatar
Dennis Waldron committed
602
/
603

604
/* Search and delete old archived/failed subrequests and their requests */
605
CREATE OR REPLACE PROCEDURE deleteTerminatedRequests AS
606
607
  timeOut INTEGER;
  rate INTEGER;
608
  srIds "numList";
609
  ct NUMBER;
610
BEGIN
611
  -- select requested timeout from configuration table
612
  timeout := 3600*TO_NUMBER(getConfigOption('cleaning', 'terminatedRequestsTimeout', '120'));
613
614
615
616
617
  -- get a rough estimate of the current request processing rate
  SELECT count(*) INTO rate
    FROM SubRequest
   WHERE status IN (9, 11)  -- FAILED_FINISHED, ARCHIVED
     AND lastModificationTime > getTime() - 1800;
618
619
  IF rate > 0 AND (1000000 / rate * 1800) < timeOut THEN
    timeOut := 1000000 / rate * 1800;  -- keep 1M requests max
620
  END IF;
621
622
  
  -- delete castorFiles if nothing is left for them. Here we use
623
  -- a temporary table as we need to commit every ~1000 operations
624
625
626
  -- and keeping a cursor opened on the original select may take
  -- too long, leading to ORA-01555 'snapshot too old' errors.
  EXECUTE IMMEDIATE 'TRUNCATE TABLE DeleteTermReqHelper';
627
  INSERT /*+ APPEND */ INTO DeleteTermReqHelper (srId, cfId)
628
629
630
    (SELECT id, castorFile FROM SubRequest
      WHERE status IN (9, 11)
        AND lastModificationTime < getTime() - timeOut);
Giuseppe Lo Presti's avatar
Giuseppe Lo Presti committed
631
  COMMIT;
632
633
634
635
636
637
638
639
640
  ct := 0;
  FOR cf IN (SELECT UNIQUE cfId FROM DeleteTermReqHelper) LOOP
    deleteCastorFile(cf.cfId);
    ct := ct + 1;
    IF ct = 1000 THEN
      COMMIT;
      ct := 0;
    END IF;
  END LOOP;
641

642
643
644
645
646
  -- now delete all old subRequest. We reuse here the
  -- temporary table, which serves as a snapshot of the
  -- entries to be deleted, and we use the FORALL logic
  -- (cf. bulkDelete) instead of a simple DELETE ...
  -- WHERE id IN (SELECT srId FROM DeleteTermReqHelper)
Giuseppe Lo Presti's avatar
Giuseppe Lo Presti committed
647
648
649
650
  -- for efficiency reasons. Moreover, we don't risk
  -- here the ORA-01555 error keeping the cursor open
  -- between commits as we are selecting on our
  -- temporary table.
651
  DECLARE
652
653
    CURSOR s IS
      SELECT srId FROM DeleteTermReqHelper;
654
655
    ids "numList";
  BEGIN
656
    OPEN s;
Giuseppe Lo Presti's avatar
Giuseppe Lo Presti committed
657
658
    LOOP
      FETCH s BULK COLLECT INTO ids LIMIT 10000;
659
      EXIT WHEN ids.count = 0;
660
      FORALL i IN 1 .. ids.COUNT
661
        DELETE FROM SubRequest WHERE id = ids(i);
Giuseppe Lo Presti's avatar
Giuseppe Lo Presti committed
662
663
      COMMIT;
    END LOOP;
664
    CLOSE s;
665
  END;
666
  EXECUTE IMMEDIATE 'TRUNCATE TABLE DeleteTermReqHelper';
667

668
669
670
671
672
  -- And then related Requests, now orphaned.
  -- The timeout makes sure we keep very recent requests,
  -- even if they have no subrequests. This may actually
  -- be the case only for Repack requests, as they're
  -- created empty and filled after querying the NS.
673
    ---- Get ----
674
  bulkDeleteRequests('StageGetRequest', timeOut);
675
    ---- Put ----
676
  bulkDeleteRequests('StagePutRequest', timeOut);
677
    ---- Update ----
678
  bulkDeleteRequests('StageUpdateRequest', timeOut);
679
    ---- PrepareToGet -----
680
  bulkDeleteRequests('StagePrepareToGetRequest', timeOut);
681
    ---- PrepareToPut ----
682
  bulkDeleteRequests('StagePrepareToPutRequest', timeOut);
683
    ---- PrepareToUpdate ----
684
  bulkDeleteRequests('StagePrepareToUpdateRequest', timeOut);
685
    ---- PutDone ----
686
  bulkDeleteRequests('StagePutDoneRequest', timeOut);
687
    ---- Rm ----
688
  bulkDeleteRequests('StageRmRequest', timeOut);
689
    ---- Repack ----
690
  bulkDeleteRequests('StageRepackRequest', timeOut);
691
    ---- SetGCWeight ----
692
  bulkDeleteRequests('SetFileGCWeight', timeOut);
693
END;
Dennis Waldron's avatar
Dennis Waldron committed
694
/
695

696
697
698
699
/* Search and delete old diskCopies in bad states */
CREATE OR REPLACE PROCEDURE deleteFailedDiskCopies(timeOut IN NUMBER) AS
  dcIds "numList";
  cfIds "numList";
700
BEGIN
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
  LOOP
    -- select INVALID diskcopies without filesystem (they can exist after a
    -- stageRm that came before the diskcopy had been created on disk) and ALL FAILED
    -- ones (coming from failed recalls or failed removals from the GC daemon).
    -- Note that we don't select INVALID diskcopies from recreation of files
    -- because they are taken by the standard GC as they physically exist on disk.
    -- go only for 1000 at a time and retry if the limit was reached
    SELECT id
      BULK COLLECT INTO dcIds
      FROM DiskCopy
     WHERE (status = 4 OR (status = 7 AND fileSystem = 0))
       AND creationTime < getTime() - timeOut
       AND ROWNUM <= 1000;
    SELECT /*+ INDEX(DC PK_DiskCopy_ID) */ UNIQUE castorFile
      BULK COLLECT INTO cfIds
      FROM DiskCopy DC
     WHERE id IN (SELECT /*+ CARDINALITY(ids 5) */ * FROM TABLE(dcIds) ids);
    -- drop the DiskCopies
    FORALL i IN 1 .. dcIds.COUNT
      DELETE FROM DiskCopy WHERE id = dcIds(i);
    COMMIT;
    -- maybe delete the CastorFiles if nothing is left for them
    FOR i IN 1 .. cfIds.COUNT LOOP
      deleteCastorFile(cfIds(i));
    END LOOP;
    COMMIT;
    -- exit if we did less than 1000
    IF dcIds.COUNT < 1000 THEN EXIT; END IF;
729
  END LOOP;
730
END;
Dennis Waldron's avatar
Dennis Waldron committed
731
/
732

733
/* Deal with old diskCopies in STAGEOUT */
734
CREATE OR REPLACE PROCEDURE deleteOutOfDateStageOutDCs(timeOut IN NUMBER) AS
735
  srId NUMBER;
736
BEGIN
737
738
  -- Deal with old DiskCopies in STAGEOUT/WAITFS. The rule is to drop
  -- the ones with 0 fileSize and issue a putDone for the others
739
740
  FOR f IN (SELECT /*+ USE_NL(D C S) LEADING(D C S) INDEX(D I_DiskCopy_Status_Open) INDEX(S I_SubRequest_CastorFile) */
                   C.filesize, C.id, C.fileId, C.nsHost, D.fileSystem, D.id AS dcId, D.status AS dcStatus
741
742
743
              FROM DiskCopy D, Castorfile C
             WHERE C.id = D.castorFile
               AND D.creationTime < getTime() - timeOut
744
               AND decode(D.status,6,D.status,decode(D.status,5,D.status,decode(D.status,11,D.status,NULL))) IS NOT NULL
745
               AND NOT EXISTS (
746
747
                 SELECT 'x'
                   FROM SubRequest
748
                  WHERE castorFile = C.id
749
                    AND status IN (0, 1, 2, 3, 5, 6, 13) -- all active
750
                    AND reqType NOT IN (37, 38))) LOOP -- ignore PrepareToPut, PrepareToUpdate
751
    IF (0 = f.fileSize) OR (f.dcStatus <> 6) THEN  -- DISKCOPY_STAGEOUT
752
      -- here we invalidate the diskcopy and let the GC run
753
754
755
756
      UPDATE DiskCopy SET status = 7  -- INVALID
       WHERE id = f.dcid;
      -- and we also fail the correspondent prepareToPut/Update request if it exists
      BEGIN
757
        SELECT /*+ INDEX_RS_ASC(Subrequest I_Subrequest_Diskcopy)*/ id
758
          INTO srId   -- there can only be one outstanding PrepareToPut/Update, if any
759
760
761
762
763
764
          FROM SubRequest
         WHERE status = 6 AND diskCopy = f.dcid;
        archiveSubReq(srId, 9);  -- FAILED_FINISHED
      EXCEPTION WHEN NO_DATA_FOUND THEN
        NULL;
      END;
765
      logToDLF(NULL, dlf.LVL_WARNING, dlf.FILE_DROPPED_BY_CLEANING, f.fileId, f.nsHost, 'stagerd', '');
766
767
    ELSE
      -- here we issue a putDone
768
769
770
      -- context 2 : real putDone. Missing PPut requests are ignored.
      -- svcClass 0 since we don't know it. This will trigger a
      -- default behavior in the putDoneFunc
771
      putDoneFunc(f.id, f.fileSize, 2, 0);
772
      logToDLF(NULL, dlf.LVL_WARNING, dlf.PUTDONE_ENFORCED_BY_CLEANING, f.fileId, f.nsHost, 'stagerd', '');
773
    END IF;
774
  END LOOP;
775
  COMMIT;
776
END;
Dennis Waldron's avatar
Dennis Waldron committed
777
/
778

779
/* Runs cleanup operations */
780
781
CREATE OR REPLACE PROCEDURE cleanup AS
  t INTEGER;
782
BEGIN
783
784
  -- First perform some cleanup of old stuff:
  -- for each, read relevant timeout from configuration table
785
  t := TO_NUMBER(getConfigOption('cleaning', 'outOfDateStageOutDCsTimeout', '72'));
786
  deleteOutOfDateStageOutDCs(t*3600);
787
  t := TO_NUMBER(getConfigOption('cleaning', 'failedDCsTimeout', '72'));
788
  deleteFailedDiskCopies(t*3600);
789
END;
Dennis Waldron's avatar
Dennis Waldron committed
790
/
791

792
793
794
795
796
/*
 * Database jobs
 */
BEGIN
  -- Remove database jobs before recreating them
797
  FOR j IN (SELECT job_name FROM user_scheduler_jobs
798
799
             WHERE job_name IN ('HOUSEKEEPINGJOB',
                                'CLEANUPJOB',
800
                                'BULKCHECKFSBACKINPRODJOB'))
801
  LOOP
802
    DBMS_SCHEDULER.DROP_JOB(j.job_name, TRUE);
803
  END LOOP;
804

805
  -- Create a db job to be run every 20 minutes executing the deleteTerminatedRequests procedure
806
807
808
  DBMS_SCHEDULER.CREATE_JOB(
      JOB_NAME        => 'houseKeepingJob',
      JOB_TYPE        => 'PLSQL_BLOCK',
809
      JOB_ACTION      => 'BEGIN startDbJob(''BEGIN deleteTerminatedRequests(); END;'', ''stagerd''); END;',
810
      JOB_CLASS       => 'CASTOR_JOB_CLASS',
811
      START_DATE      => SYSDATE + 60/1440,
812
813
814
      REPEAT_INTERVAL => 'FREQ=MINUTELY; INTERVAL=20',
      ENABLED         => TRUE,
      COMMENTS        => 'Cleaning of terminated requests');
815

816
  -- Create a db job to be run twice a day executing the cleanup procedure
817
  DBMS_SCHEDULER.CREATE_JOB(
818
      JOB_NAME        => 'cleanupJob',
819
      JOB_TYPE        => 'PLSQL_BLOCK',
820
      JOB_ACTION      => 'BEGIN startDbJob(''BEGIN cleanup(); END;'', ''stagerd''); END;',
821
      JOB_CLASS       => 'CASTOR_JOB_CLASS',
822
      START_DATE      => SYSDATE + 60/1440,
823
      REPEAT_INTERVAL => 'FREQ=HOURLY; INTERVAL=12',
824
      ENABLED         => TRUE,
825
826
      COMMENTS        => 'Database maintenance');

827
  -- Create a db job to be run every 5 minutes executing the bulkCheckFSBackInProd procedure
828
829
830
  DBMS_SCHEDULER.CREATE_JOB(
      JOB_NAME        => 'bulkCheckFSBackInProdJob',
      JOB_TYPE        => 'PLSQL_BLOCK',
831
      JOB_ACTION      => 'BEGIN startDbJob(''BEGIN bulkCheckFSBackInProd(); END;'', ''stagerd''); END;',
832
      JOB_CLASS       => 'CASTOR_JOB_CLASS',
833
      START_DATE      => SYSDATE + 60/1440,
834
835
836
      REPEAT_INTERVAL => 'FREQ=MINUTELY; INTERVAL=5',
      ENABLED         => TRUE,
      COMMENTS        => 'Bulk operation to processing filesystem state changes');
837
END;
Dennis Waldron's avatar
Dennis Waldron committed
838
/
Dennis Waldron's avatar
Dennis Waldron committed
839