GarbageCollector.cpp 34 KB
Newer Older
1
/*
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
 * The CERN Tape Archive (CTA) project
 * Copyright (C) 2015  CERN
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

19
#include "GarbageCollector.hpp"
#include "AgentReference.hpp"
#include "ArchiveQueue.hpp"
#include "ArchiveRequest.hpp"
#include "Helpers.hpp"
#include "RetrieveQueue.hpp"
#include "RetrieveRequest.hpp"
#include "RootEntry.hpp"

#include <algorithm>
#include <memory>
#include <set>
28
29
30

namespace cta { namespace objectstore {

31
32
GarbageCollector::GarbageCollector(Backend & os, AgentReference & agentReference, catalogue::Catalogue & catalogue): 
  m_objectStore(os), m_catalogue(catalogue), m_ourAgentReference(agentReference), m_agentRegister(os) {
  // Discover the agent register address from the root entry. The root entry
  // lock lives in its own scope so it is released before we touch the
  // register object itself.
  std::string agentRegisterAddress;
  {
    RootEntry rootEntry(m_objectStore);
    ScopedSharedLock rootEntryLock(rootEntry);
    rootEntry.fetch();
    agentRegisterAddress = rootEntry.getAgentRegisterAddress();
  }
  m_agentRegister.setAddress(agentRegisterAddress);
  // Prime our in-memory copy of the agent register (shared lock dropped on
  // scope exit).
  ScopedSharedLock agentRegisterLock(m_agentRegister);
  m_agentRegister.fetch();
}
41

42
43
44
45
// Run one full garbage collection cycle: stop tracking agents that
// disappeared from the register, start tracking newly registered untracked
// agents, then check heartbeats and garbage collect the dead ones.
void GarbageCollector::runOnePass(log::LogContext & lc) {
  trimGoneTargets(lc);
  aquireTargets(lc);
  checkHeartbeats(lc);
}
47
  
48
// Stop tracking the agents we knew about which are no longer present in the
// agent register: their watchdogs are released and the references dropped.
void GarbageCollector::trimGoneTargets(log::LogContext & lc) {
  m_agentRegister.fetchNoLock();
  std::list<std::string> agentList = m_agentRegister.getAgents();
  // Build a set once for cheap membership tests: the previous implementation
  // ran a linear std::find over the agent list for every watched agent
  // (O(watched * registered)).
  const std::set<std::string> registeredAgents(agentList.begin(), agentList.end());
  // Find the agents we knew about and are not listed anymore.
  // We will just stop looking for them.
  for (auto wa = m_watchedAgents.begin(); wa != m_watchedAgents.end();) {
    if (!registeredAgents.count(wa->first)) {
      delete wa->second;
      log::ScopedParamContainer params(lc);
      // Record the address before erasing: wa->first dangles afterwards.
      params.add("agentAddress", wa->first);
      wa = m_watchedAgents.erase(wa);
      lc.log(log::INFO, "In GarbageCollector::trimGoneTargets(): removed now gone agent.");
    } else {
      ++wa;
    }
  }
}
67

68
// Start tracking every untracked agent listed in the agent register (except
// ourselves). Agents that vanish between listing and fetching are skipped.
void GarbageCollector::aquireTargets(log::LogContext & lc) {
  m_agentRegister.fetchNoLock();
  // We will now watch all agents we do not know about yet.
  std::list<std::string> candidatesList = m_agentRegister.getUntrackedAgents();
  // Build a set of our own tracked agents.
  std::set<std::string> alreadyTrackedAgents;
  for (auto &ata: m_watchedAgents) {
    alreadyTrackedAgents.insert(ata.first);
  }
  for (auto &c: candidatesList) {
    // We don't monitor ourselves
    if (c != m_ourAgentReference.getAgentAddress() && !alreadyTrackedAgents.count(c)) {
      // So we have a candidate we might want to monitor.
      // First, check that the agent entry still exists.
      Agent ag(c, m_objectStore);
      try {
        ag.fetchNoLock();
      } catch (...) {
        // The agent could simply be gone... (If not, let the complaint go through).
        if (m_objectStore.exists(c)) throw;
        continue;
      }
      // NOTE(review): an empty `if (ag.getOwner() == m_agentRegister.getAddressIfSet()) {}`
      // used to sit here; its result was ignored, so the dead check was
      // removed. Confirm that tracking regardless of current ownership is
      // the intended behavior (cleanupDeadAgent re-checks ownership anyway).
      log::ScopedParamContainer params(lc);
      params.add("agentAddress", ag.getAddressIfSet())
            .add("gcAgentAddress", m_ourAgentReference.getAgentAddress());
      lc.log(log::INFO, "In GarbageCollector::aquireTargets(): started tracking an untracked agent");
      // Agent is to be tracked, let's track it.
      double timeout = ag.getTimeout();
      // The creation of the watchdog could fail as well (if agent gets deleted in the mean time).
      try {
        // Fully construct and configure the watchdog before publishing it in
        // the map. The previous code assigned the raw pointer into the map
        // first; if setTimeout() then threw, the catch clause erased the map
        // entry without deleting the watchdog, leaking it.
        std::unique_ptr<AgentWatchdog> watchdog(new AgentWatchdog(c, m_objectStore));
        watchdog->setTimeout(timeout);
        m_watchedAgents[c] = watchdog.release();
      } catch (...) {
        if (m_objectStore.exists(c)) throw;
        // Defensive: nothing should be published on failure, and erasing a
        // non-existent key is a harmless no-op.
        m_watchedAgents.erase(c);
        continue;
      }
    }
  }
}
 
113
// Poll the heartbeat of every watched agent. Agents whose heartbeat stopped
// are garbage collected (ownership requeued) and removed from tracking.
// Agents that disappear mid-check are tolerated and trimmed on a later pass.
void GarbageCollector::checkHeartbeats(log::LogContext & lc) {
  // Check the heartbeats of the watched agents
  // We can still fail on many steps
  for (std::map<std::string, AgentWatchdog * >::iterator wa = m_watchedAgents.begin();
      wa != m_watchedAgents.end();) {
    // Get the heartbeat. Clean dead agents and remove references to them
    try {
      if (!wa->second->checkAlive()) {
        // Heartbeat stopped: requeue everything the agent owned, then drop
        // the watchdog. erase(wa++) advances the iterator before erasure so
        // it stays valid.
        cleanupDeadAgent(wa->first, lc);
        delete wa->second;
        m_watchedAgents.erase(wa++);
      } else {
        wa++;
      }
    } catch (cta::exception::Exception & ex) {
      if (wa->second->checkExists()) {
        // We really have a problem: we failed to check on an agent, that is still present.
        throw;
      } else {
        // The agent is simply gone on the wrong time. It will be trimmed from the list on the next pass.
        wa++;
      }
    }
  }
}

Eric Cano's avatar
Eric Cano committed
139
void GarbageCollector::cleanupDeadAgent(const std::string & address, log::LogContext & lc) {
140
141
142
143
144
  // We detected a dead agent. Try and take ownership of it. It could already be owned
  // by another garbage collector.
  // To minimize locking, take a lock on the agent and check its ownership first.
  // We do not need to be defensive about exception here as calling function will
  // deal with them.
Eric Cano's avatar
Eric Cano committed
145
  Agent agent(address, m_objectStore);
146
147
148
149
150
151
152
153
154
155
156
  ScopedExclusiveLock agLock;
  try {
    // The agent could be gone while we try to lock it.
    agLock.lock(agent);
  } catch (Backend::NoSuchObject & ex) {
    log::ScopedParamContainer params(lc);
    params.add("agentAddress", agent.getAddressIfSet())
          .add("gcAgentAddress", m_ourAgentReference.getAgentAddress());
    lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): agent already deleted when trying to lock it. Skipping it.");
    return;
  }
Eric Cano's avatar
Eric Cano committed
157
158
159
  agent.fetch();
  log::ScopedParamContainer params(lc);
  params.add("agentAddress", agent.getAddressIfSet())
160
        .add("gcAgentAddress", m_ourAgentReference.getAgentAddress());
161
162
163
164
165
  if (agent.getOwner() != m_agentRegister.getAddressIfSet()) {
    params.add("agentOwner", agent.getOwner());
    lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): skipping agent which is not owned by agent register anymore.");
    // The agent will be removed from our ownership by the calling function: we're done.
    return;
Eric Cano's avatar
Eric Cano committed
166
  }
167
168
169
170
171
172
173
174
175
176
  // Aquire ownership of the agent.
  m_ourAgentReference.addToOwnership(address,m_objectStore);
  agent.setOwner(m_ourAgentReference.getAgentAddress());
  agent.commit();
  // Update the register
  ScopedExclusiveLock arl(m_agentRegister);
  m_agentRegister.fetch();
  m_agentRegister.trackAgent(address);
  m_agentRegister.commit();
  arl.release();
Eric Cano's avatar
Eric Cano committed
177
178
  lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): will cleanup dead agent.");
  // Return all objects owned by the agent to their respective backup owners
179
180
    
  const auto ownedObjectAddresses = agent.getOwnershipList();
181
182
183
  // Parallel fetch (lock free) all the objects to assess their status (check ownership,
  // type and decide to which queue they will go.
  std::list<std::shared_ptr<GenericObject>> ownedObjects;
184
  std::list<std::shared_ptr<GenericObject>> fetchedObjects;
185
  std::map<GenericObject *, std::unique_ptr<GenericObject::AsyncLockfreeFetcher>> ownedObjectsFetchers;
186
187
188
189
190
191
192
  // This will be the list of objects we failed to garbage collect. This means the garbage collection
  // will be partial (looping?).
  std::list<std::string> skippedObjects;
  // This will be the list of objects that are simply gone. We will still need to remove the from the ownership
  // list of agent.
  std::list<std::string> goneObjects;
  // 1 launch the async fetch of all the objects.
193
  for (auto & obj : ownedObjectAddresses) {
194
    // Fetch generic objects
195
    ownedObjects.emplace_back(new GenericObject(obj, m_objectStore));
196
    try {
197
      ownedObjectsFetchers[ownedObjects.back().get()].reset(ownedObjects.back()->asyncLockfreeFetch());
198
199
200
201
202
203
204
    } catch (cta::exception::Exception & ex) {
      // We failed to lauch the fetch. This is unepected (absence of object will come later). We will not be able
      // to garbage collect this object.
      skippedObjects.emplace_back(obj);
      // Cleanup object reference. We will skip on this object. That means the garbage collection will
      // be left incomplete
      ownedObjectsFetchers.erase(ownedObjects.back().get());
205
      ownedObjects.pop_back();
206
207
208
209
210
      // Log the error.
      log::ScopedParamContainer params(lc);
      params.add("objectAddress", obj)
            .add("exceptionMessage", ex.getMessageValue());
      lc.log(log::ERR, "In GarbageCollector::cleanupDeadAgent(): failed to asyncLockfreeFetch(): skipping object. Garbage collection will be incomplete.");
211
212
    }
  }
213
214
215
216
217
218
219
220
221
//    if (ownedObjects.back()->exists()) {
//      ownedObjectsFetchers[ownedObjects.back().get()].reset(ownedObjects.back()->asyncLockfreeFetch());
//    } else {
//      agent.removeFromOwnership(ownedObjects.back()->getAddressIfSet());
//      agent.commit();
//      ownedObjects.pop_back();
//      lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): skipping garbage collection of now gone object.");
//    }
//  }
222
  
223
  // 2 find out the result of the fetches
224
  OwnedObjectSorter ownedObjectSorter;
225
  bool ownershipUdated=false;
226
227
228
229
  using serializers::ArchiveJobStatus;
  std::set<ArchiveJobStatus> inactiveArchiveJobStatuses({ArchiveJobStatus::AJS_Complete, ArchiveJobStatus::AJS_Failed});
  using serializers::RetrieveJobStatus;
  std::set<RetrieveJobStatus> inactiveRetrieveJobStatuses({RetrieveJobStatus::RJS_Complete, RetrieveJobStatus::RJS_Failed});
230
  for (auto & obj : ownedObjects) {
231
232
    log::ScopedParamContainer params2(lc);
    params2.add("objectAddress", obj->getAddressIfSet());
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
    try {
      ownedObjectsFetchers.at(obj.get())->wait();
    } catch (Backend::NoSuchObject & ex) {
      goneObjects.push_back(obj->getAddressIfSet());
      lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): skipping garbage collection of now gone object.");
      ownedObjectsFetchers.erase(obj.get());
      agent.removeFromOwnership(obj->getAddressIfSet());
      ownershipUdated=true;
      continue;
    } catch (cta::exception::Exception & ex) {
      // Again, we have a problem. We will skip the object and have an incomplete GC.
      skippedObjects.push_back(obj->getAddressIfSet());
      params2.add("exceptionMessage", ex.getMessageValue());
      lc.log(log::ERR, "In GarbageCollector::cleanupDeadAgent(): failed to AsyncLockfreeFetch::wait(): skipping object. Garbage collection will be incomplete.");
      ownedObjectsFetchers.erase(obj.get());
      continue;
    }
    // This object passed the cut, we can record it for next round.
251
    ownedObjectsFetchers.erase(obj.get());
252
253
254
255
256
257
258
259
    fetchedObjects.emplace_back(obj);
  }
  // The generic objects we are interested in are now also stored in fetchedObjects.
  ownedObjects.clear();
  // 3 Now decide the fate of each fetched and owned object.
  for (auto & obj: fetchedObjects) {
    log::ScopedParamContainer params2(lc);
    params2.add("objectAddress", obj->getAddressIfSet());
260
    if (obj->getOwner() != agent.getAddressIfSet()) {
261
      // For all object types except ArchiveRequests, this means we do
262
      // not need to deal with this object.
263
264
265
      if (obj->type() == serializers::ArchiveRequest_t) {
        ArchiveRequest ar(*obj);
        for (auto & j:ar.dumpJobs()) if (j.owner == agent.getAddressIfSet()) goto doGCObject;
266
267
268
      } else {
        // Log the owner (except for archiveRequests which can have several owners).
        params2.add("actualOwner", obj->getAddressIfSet());
269
      }
270
      lc.log(log::WARNING, "In GarbageCollector::cleanupDeadAgent(): skipping object which is not owned by this agent");
271
272
      agent.removeFromOwnership(obj->getAddressIfSet());
      ownershipUdated=true;
273
274
275
276
277
278
279
280
281
282
283
      continue;
    }
  doGCObject:
    switch (obj->type()) {
      case serializers::ArchiveRequest_t:
      {
        // We need to find out in which queue or queues the owned job(s)
        // Decision is simple: if the job is owned and active, it needs to be requeued
        // in its destination archive queue.
        // Get hold of an (unlocked) archive request:
        std::shared_ptr<ArchiveRequest> ar(new ArchiveRequest(*obj));
284
        obj.reset();
285
286
287
288
289
290
291
292
293
294
        bool jobRequeued=false;
        for (auto &j: ar->dumpJobs()) {
          if ((j.owner == agent.getAddressIfSet() || ar->getOwner() == agent.getAddressIfSet())
              && !inactiveArchiveJobStatuses.count(j.status)) {
            ownedObjectSorter.archiveQueuesAndRequests[j.tapePool].emplace_back(ar);
            log::ScopedParamContainer params3(lc);
            params3.add("tapepool", j.tapePool)
                   .add("copynb", j.copyNb)
                   .add("fileId", ar->getArchiveFile().archiveFileID);
            lc.log(log::INFO, "Selected archive request for requeueing to tape pool");
295
            jobRequeued=true;
296
297
298
299
300
301
302
303
          }
        }
        if (!jobRequeued) {
          log::ScopedParamContainer params3(lc);
          params3.add("fileId", ar->getArchiveFile().archiveFileID);
          lc.log(log::INFO, "No active archive job to requeue found. Request will remain as-is.");
        }
        break;
304
      }
305
306
307
308
      case serializers::RetrieveRequest_t:
      {
        // We need here to re-determine the best tape (and queue) for the retrieve request.
        std::shared_ptr<RetrieveRequest> rr(new RetrieveRequest(*obj));
309
        obj.reset();
310
311
312
313
314
315
316
317
318
319
        // Get the list of vids for non failed tape files.
        std::set<std::string> candidateVids;
        for (auto & j: rr->dumpJobs()) {
          if (!inactiveRetrieveJobStatuses.count(j.status)) {
            candidateVids.insert(rr->getArchiveFile().tapeFiles.at(j.copyNb).vid);
          }
        }
        if (candidateVids.empty()) {
          log::ScopedParamContainer params3(lc);
          params3.add("fileId", rr->getArchiveFile().archiveFileID);
320
321
322
323
324
325
326
327
          lc.log(log::INFO, "No active retrieve job to requeue found. Marking request for normal GC (and probably deletion).");
          ownedObjectSorter.otherObjects.emplace_back(new GenericObject(rr->getAddressIfSet(), m_objectStore));
          break;
        }
        std::string vid;
        try {
          vid=Helpers::selectBestRetrieveQueue(candidateVids, m_catalogue, m_objectStore);
        } catch (Helpers::NoTapeAvailableForRetrieve & ex) {
328
          log::ScopedParamContainer params3(lc);
329
330
331
332
          params3.add("fileId", rr->getArchiveFile().archiveFileID);
          lc.log(log::INFO, "No available tape found. Marking request for normal GC (and probably deletion).");
          ownedObjectSorter.otherObjects.emplace_back(new GenericObject(rr->getAddressIfSet(), m_objectStore));
          break;
333
        }
334
335
336
337
338
339
340
341
342
343
344
        ownedObjectSorter.retrieveQueuesAndRequests[vid].emplace_back(rr);
        log::ScopedParamContainer params3(lc);
        // Find copyNb for logging
        size_t copyNb = std::numeric_limits<size_t>::max();
        uint64_t fSeq = std::numeric_limits<uint64_t>::max();
        for (auto & tc: rr->getArchiveFile().tapeFiles) { if (tc.second.vid==vid) { copyNb=tc.first; fSeq=tc.second.fSeq; } }
        params3.add("fileId", rr->getArchiveFile().archiveFileID)
               .add("copyNb", copyNb)
               .add("vid", vid)
               .add("fSeq", fSeq);
        lc.log(log::INFO, "Selected vid to be requeued for retrieve request.");
345
346
347
        break;
      }
      default:
348
349
        // For other objects, we will not implement any optimization and simply call
        // their individual garbageCollect method.
350
        ownedObjectSorter.otherObjects.emplace_back(obj);
351
        obj.reset();
352
        break;
353
    }
354
    // We can now get rid of the generic object (data was transferred in a (typed) object in the sorter).
355
  }
356
357
358
  // We are now done with the next container.
  if (ownershipUdated) agent.commit();
  fetchedObjects.clear();
359
  
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
  // We can now start updating the objects efficiently. We still need to re-fetch them locked 
  // and validate ownership.
  // 
  // 1) Get the archive requests done.
  for (auto & tapepool: ownedObjectSorter.archiveQueuesAndRequests) {
    double queueLockFetchTime=0;
    double queuePreparationTime=0;
    double queueCommitTime=0;
    double requestsUpdatePreparationTime=0;
    double requestsUpdatingTime=0;
    double queueRecommitTime=0;
    uint64_t filesQueued=0;
    uint64_t filesDequeued=0;
    uint64_t bytesQueued=0;
    uint64_t bytesDequeued=0;
    uint64_t filesBefore=0;
    uint64_t bytesBefore=0;
    utils::Timer t;
    // Get the archive queue and add references to the jobs in it.
    bool didAddToQueue=false;
    ArchiveQueue aq(m_objectStore);
    ScopedExclusiveLock aql;
    Helpers::getLockedAndFetchedQueue<ArchiveQueue>(aq, aql, m_ourAgentReference, tapepool.first, lc);
    queueLockFetchTime = t.secs(utils::Timer::resetCounter);
    auto jobsSummary=aq.getJobsSummary();
    filesBefore=jobsSummary.files;
    bytesBefore=jobsSummary.bytes;
    // We have the queue. We will loop on the requests, add them to the queue. We will launch their updates 
    // after committing the queue.
    for (auto & ar: tapepool.second) {
      // Determine the copy number and feed the queue with it.
      for (auto &j: ar->dumpJobs()) {
        if (j.tapePool == tapepool.first) {
          if (aq.addJobIfNecessary(j, ar->getAddressIfSet(), ar->getArchiveFile().archiveFileID, 
              ar->getArchiveFile().fileSize, ar->getMountPolicy(), ar->getEntryLog().time)) {
            didAddToQueue = true;
            filesQueued++;
            bytesQueued += ar->getArchiveFile().fileSize;
          }          
        }
      }
    }
    queuePreparationTime = t.secs(utils::Timer::resetCounter);
    // If we have an unexpected failure, we will re-run the individual garbage collection. Before that, 
    // we will NOT remove the object from agent's ownership. This variable is declared a bit ahead so
    // the goto will not cross its initialization.
    std::set<std::string> jobsIndividuallyGCed;
    if (didAddToQueue) {
      aq.commit();
      queueCommitTime = t.secs(utils::Timer::resetCounter);
    } else {
      goto agentCleanupForArchive;
    }
    // We will keep individual references for each job update we launch so that we make
    // no assumption (several jobs could be queued to the same pool, even if not expected
    // at the high level).
    struct ARUpdatersParams {
      std::unique_ptr<ArchiveRequest::AsyncJobOwnerUpdater> updater;
      std::shared_ptr<ArchiveRequest> archiveRequest;
      uint16_t copyNb;
    };
    {
      std::list<ARUpdatersParams> arUpdatersParams;
      for (auto & ar: tapepool.second) {
        for (auto & j: ar->dumpJobs()) {
          if (j.tapePool == tapepool.first) {
            arUpdatersParams.emplace_back();
            arUpdatersParams.back().archiveRequest = ar;
            arUpdatersParams.back().copyNb = j.copyNb;
            arUpdatersParams.back().updater.reset(
              ar->asyncUpdateJobOwner(j.copyNb, aq.getAddressIfSet(), address));
          }
        }
      }
      requestsUpdatePreparationTime = t.secs(utils::Timer::resetCounter);
      // Now collect the results.
      bool aqUpdated=false;
      for (auto & arup: arUpdatersParams) {
        try {
          arup.updater->wait();
          // OK, the job made it to the queue
          log::ScopedParamContainer params(lc);
          params.add("archiveRequestObject", arup.archiveRequest->getAddressIfSet())
                .add("copyNb", arup.copyNb)
                .add("fileId", arup.archiveRequest->getArchiveFile().archiveFileID)
                .add("tapepool", tapepool.first)
                .add("archiveQueueObject", aq.getAddressIfSet())
                .add("garbageCollectedPreviousOwner", agent.getAddressIfSet());
          lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): requeued archive job.");
        } catch (cta::exception::Exception & e) {
          // Update did not go through. It could be benign
          std::string debugType=typeid(e).name();
          if (typeid(e) == typeid(Backend::NoSuchObject)) {
            // The object was not present or not owned during update, so we skip it.
            // This is nevertheless unexpected (from previous fetch, so this is an error).
            log::ScopedParamContainer params(lc);
            params.add("archiveRequestObject", arup.archiveRequest->getAddressIfSet())
                  .add("copyNb", arup.copyNb)
                  .add("fileId", arup.archiveRequest->getArchiveFile().archiveFileID)
                  .add("exceptionType", debugType);
            lc.log(log::ERR, "In GarbageCollector::cleanupDeadAgent(): failed to requeue gone/not owned archive job. Removing from queue.");
          } else {
            // We have an unexpected error. We will handle this with the request-by-request garbage collection.
            log::ScopedParamContainer params(lc);
            params.add("archiveRequestObject", arup.archiveRequest->getAddressIfSet())
                  .add("copyNb", arup.copyNb)
                  .add("fileId", arup.archiveRequest->getArchiveFile().archiveFileID)
                  .add("exceptionType", debugType)
                  .add("exceptionMessage", e.getMessageValue());
            lc.log(log::ERR, "In GarbageCollector::cleanupDeadAgent(): failed to requeue archive job with unexpected error. Removing from queue and will re-run individual garbage collection.");
            // We will re-run the individual GC for this one.
            jobsIndividuallyGCed.insert(arup.archiveRequest->getAddressIfSet());
            ownedObjectSorter.otherObjects.emplace_back(new GenericObject(arup.archiveRequest->getAddressIfSet(), m_objectStore));
          }
          // In all cases, the object did NOT make it to the queue.
          filesDequeued ++;
          bytesDequeued += arup.archiveRequest->getArchiveFile().fileSize;
          aq.removeJob(arup.archiveRequest->getAddressIfSet());
          aqUpdated=true;
        }
      }
      requestsUpdatingTime = t.secs(utils::Timer::resetCounter);
      if (aqUpdated) {
        aq.commit();
        log::ScopedParamContainer params(lc);
        params.add("archiveQueueObject", aq.getAddressIfSet());
        lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): RE-committed archive queue after error handling.");
        queueRecommitTime = t.secs(utils::Timer::resetCounter);
      }
    }
    {
      log::ScopedParamContainer params(lc);
      auto jobsSummary = aq.getJobsSummary();
      params.add("tapepool", tapepool.first)
            .add("archiveQueueObject", aq.getAddressIfSet())
            .add("filesAdded", filesQueued - filesDequeued)
            .add("bytesAdded", bytesQueued - bytesDequeued)
            .add("filesAddedInitially", filesQueued)
            .add("bytesAddedInitially", bytesQueued)
            .add("filesDequeuedAfterErrors", filesDequeued)
            .add("bytesDequeuedAfterErrors", bytesDequeued)
            .add("filesBefore", filesBefore)
            .add("bytesBefore", bytesBefore)
            .add("filesAfter", jobsSummary.files)
            .add("bytesAfter", jobsSummary.bytes)
            .add("queueLockFetchTime", queueLockFetchTime)
            .add("queuePreparationTime", queuePreparationTime)
            .add("queueCommitTime", queueCommitTime)
            .add("requestsUpdatePreparationTime", requestsUpdatePreparationTime)
            .add("requestsUpdatingTime", requestsUpdatingTime)
            .add("queueRecommitTime", queueRecommitTime);
      lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): Requeued a batch of archive requests.");
    }
    // We can now forget pool level list. But before that, we can remove the objects 
    // from agent ownership if this was the last reference to it.
    // The usage of use_count() is safe here because we are in a single threaded environment.
    // In a multi threaded environment, its usage would not be appropriate.
    // See for example http://en.cppreference.com/w/cpp/memory/shared_ptr/use_count
  agentCleanupForArchive:
    bool ownershipUpdated=false;
    for (auto &ar: tapepool.second) {
      if (ar.use_count() == 1 && !jobsIndividuallyGCed.count(ar->getAddressIfSet())) {
        // This tapepool is the last users of this archive request. We will remove is from ownership.
        agent.removeFromOwnership(ar->getAddressIfSet());
        ownershipUpdated=true;
      }
    }
    if (ownershipUpdated) agent.commit();
    tapepool.second.clear();
  }
530
  
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
  // 2) Get the retrieve requests done. They are simpler as retrieve requests are fully owned.
  // Then should hence not have changes since we pre-fetched them.
  for (auto & tape: ownedObjectSorter.retrieveQueuesAndRequests) {
    double queueLockFetchTime=0;
    double queuePreparationTime=0;
    double queueCommitTime=0;
    double requestsUpdatePreparationTime=0;
    double requestsUpdatingTime=0;
    double queueRecommitTime=0;
    uint64_t filesQueued=0;
    uint64_t filesDequeued=0;
    uint64_t bytesQueued=0;
    uint64_t bytesDequeued=0;
    uint64_t filesBefore=0;
    uint64_t bytesBefore=0;
    utils::Timer t;
    // Get the retrieve queue and add references to the jobs to it.
    bool didAddToQueue=false;
    RetrieveQueue rq(m_objectStore);
    ScopedExclusiveLock rql;
    Helpers::getLockedAndFetchedQueue<RetrieveQueue>(rq,rql, m_ourAgentReference, tape.first, lc);
    queueLockFetchTime = t.secs(utils::Timer::resetCounter);
    auto jobsSummary=rq.getJobsSummary();
    filesBefore=jobsSummary.files;
    bytesBefore=jobsSummary.bytes;
    // We have the queue. We will loop on the requests, add them to the queue. We will launch their updates 
    // after committing the queue.
    for (auto & rr: tape.second) {
      // Determine the copy number and feed the queue with it.
      for (auto &tf: rr->getArchiveFile().tapeFiles) {
        if (tf.second.vid == tape.first) {
          if (rq.addJobIfNecessary(tf.second.copyNb, tf.second.fSeq, rr->getAddressIfSet(), rr->getArchiveFile().fileSize, 
              rr->getRetrieveFileQueueCriteria().mountPolicy, rr->getEntryLog().time)) {
            didAddToQueue = true;
            filesQueued++;
            bytesQueued += rr->getArchiveFile().fileSize;
          }          
        }
      }
    }
    queuePreparationTime = t.secs(utils::Timer::resetCounter);
    // If we have an unexpected failure, we will re-run the individual garbage collection. Before that, 
    // we will NOT remove the object from agent's ownership. This variable is declared a bit ahead so
    // the goto will not cross its initialization.
    std::set<std::string> jobsIndividuallyGCed;
    if (didAddToQueue) {
      rq.commit();
      queueCommitTime = t.secs(utils::Timer::resetCounter);
    } else {
      goto agentCleanupForRetrieve;
    }
    // We will keep individual references for each job update we launch so that we make
    // our life easier downstream.
    struct RRUpdatedParams {
      std::unique_ptr<RetrieveRequest::AsyncOwnerUpdater> updater;
      std::shared_ptr<RetrieveRequest> retrieveRequest;
      uint16_t copyNb;
    };
    {
      std::list<RRUpdatedParams> rrUpdatersParams;
      for (auto & rr: tape.second) {
        for (auto & tf: rr->getArchiveFile().tapeFiles) {
          if (tf.second.vid == tape.first) {
            rrUpdatersParams.emplace_back();
            rrUpdatersParams.back().retrieveRequest = rr;
            rrUpdatersParams.back().copyNb = tf.second.copyNb;
            rrUpdatersParams.back().updater.reset(rr->asyncUpdateOwner(tf.second.copyNb,
                rq.getAddressIfSet(), agent.getAddressIfSet()));
          }
        }
      }
      requestsUpdatePreparationTime = t.secs(utils::Timer::resetCounter);
      // Now collect the results.
      bool rqUpdated=false;
      for (auto & rrup: rrUpdatersParams) {
        try {
          rrup.updater->wait();
          // OK, the job made it to the queue
          log::ScopedParamContainer params(lc);
          params.add("retrieveRequestObject", rrup.retrieveRequest->getAddressIfSet())
                .add("copyNb", rrup.copyNb)
                .add("fileId", rrup.retrieveRequest->getArchiveFile().archiveFileID)
                .add("vid", tape.first)
                .add("retreveQueueObject", rq.getAddressIfSet())
                .add("garbageCollectedPreviousOwner", agent.getAddressIfSet());
          lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): requeued retrieve job.");
        } catch (cta::exception::Exception & e) {
         // Update did not go through. It could be benign
          std::string debugType=typeid(e).name();
          if (typeid(e) == typeid(Backend::NoSuchObject) ||
              typeid(e) == typeid(objectstore::ArchiveRequest::WrongPreviousOwner)) {
            // The object was not present or not owned during update, so we skip it.
            // This is nevertheless unexpected (from previous fetch, so this is an error).
            log::ScopedParamContainer params(lc);
            params.add("retrieveRequestObject", rrup.retrieveRequest->getAddressIfSet())
                  .add("copyNb", rrup.copyNb)
                  .add("fileId", rrup.retrieveRequest->getArchiveFile().archiveFileID)
                  .add("exceptionType", debugType);
            lc.log(log::ERR, "In GarbageCollector::cleanupDeadAgent(): failed to requeue gone/not owned retrieve job. Removing from queue.");
          } else {
            // We have an unexpected error. Log it, and remove form queue. Not much we can
            // do at this point.
            log::ScopedParamContainer params(lc);
            params.add("retrieveRequestObject", rrup.retrieveRequest->getAddressIfSet())
                  .add("copyNb", rrup.copyNb)
                  .add("fileId", rrup.retrieveRequest->getArchiveFile().archiveFileID)
                  .add("exceptionType", debugType)
                  .add("exceptionMessage", e.getMessageValue());
            lc.log(log::ERR, "In GarbageCollector::cleanupDeadAgent(): failed to requeue retrieve job with unexpected error. Removing from queue and will re-run individual garbage collection.");
            // We will re-run the individual GC for this one.
            jobsIndividuallyGCed.insert(rrup.retrieveRequest->getAddressIfSet());
            ownedObjectSorter.otherObjects.emplace_back(new GenericObject(rrup.retrieveRequest->getAddressIfSet(), m_objectStore));
          }
          // In all cases, the object did NOT make it to the queue.
          filesDequeued ++;
          bytesDequeued += rrup.retrieveRequest->getArchiveFile().fileSize;
          rq.removeJob(rrup.retrieveRequest->getAddressIfSet());
          rqUpdated=true;
        }
      }
      requestsUpdatingTime = t.secs(utils::Timer::resetCounter);
      if (rqUpdated) {
        rq.commit();
        log::ScopedParamContainer params(lc);
        params.add("retreveQueueObject", rq.getAddressIfSet());
        lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): RE-committed retrieve queue after error handling.");
        queueRecommitTime = t.secs(utils::Timer::resetCounter);
      }
    }
    {
      log::ScopedParamContainer params(lc);
      auto jobsSummary = rq.getJobsSummary();
      params.add("vid", tape.first)
            .add("archiveQueueObject", rq.getAddressIfSet())
            .add("filesAdded", filesQueued - filesDequeued)
            .add("bytesAdded", bytesQueued - bytesDequeued)
            .add("filesAddedInitially", filesQueued)
            .add("bytesAddedInitially", bytesQueued)
            .add("filesDequeuedAfterErrors", filesDequeued)
            .add("bytesDequeuedAfterErrors", bytesDequeued)
            .add("filesBefore", filesBefore)
            .add("bytesBefore", bytesBefore)
            .add("filesAfter", jobsSummary.files)
            .add("bytesAfter", jobsSummary.bytes)
            .add("queueLockFetchTime", queueLockFetchTime)
            .add("queuePreparationTime", queuePreparationTime)
            .add("queueCommitTime", queueCommitTime)
            .add("requestsUpdatePreparationTime", requestsUpdatePreparationTime)
            .add("requestsUpdatingTime", requestsUpdatingTime)
            .add("queueRecommitTime", queueRecommitTime);
      lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): Requeued a batch of retrieve requests.");
    }
    // We can now forget pool level list. But before that, we can remove the objects 
    // from agent ownership if this was the last reference to it.
    // The usage of use_count() is safe here because we are in a single threaded environment.
    // In a multi threaded environment, its usage would not be appropriate.
    // See for example http://en.cppreference.com/w/cpp/memory/shared_ptr/use_count
  agentCleanupForRetrieve:
    bool ownershipUpdated=false;
    for (auto &rr: tape.second) {
      if (rr.use_count() == 1 && !jobsIndividuallyGCed.count(rr->getAddressIfSet())) {
        // This tape is the last user of this retrieve request. We will remove it from ownership.
        agent.removeFromOwnership(rr->getAddressIfSet());
        ownershipUpdated=true;
      }
    }
    if (ownershipUpdated) agent.commit();
    tape.second.clear();
  }
  // 3) We are done with the objects requiring mutualized queueing, and hence special treatment.
  // The rest will be garbage collected on an object-by-object basis.
  for (auto & go : ownedObjectSorter.otherObjects) { 
   // Find the object
   log::ScopedParamContainer params2(lc);
   params2.add("objectAddress", go->getAddressIfSet());
   // If the object does not exist, we're done.
   if (go->exists()) {
     ScopedExclusiveLock goLock(*go);
     go->fetch();
     // Call GenericObject's garbage collect method, which in turn will
     // delegate to the object type's garbage collector.
     go->garbageCollectDispatcher(goLock, address, m_ourAgentReference, lc, m_catalogue);
     lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): garbage collected owned object.");
   } else {
     lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): skipping garbage collection of now gone object.");
   }
   // In all cases, relinquish ownership for this object
   agent.removeFromOwnership(go->getAddressIfSet());
   agent.commit();
  }
  // We now processed all the owned objects. We can delete the agent's entry
  agent.removeAndUnregisterSelf(lc);
  lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): agent entry removed.");
  // We can remove the agent from our own ownership.
  m_ourAgentReference.removeFromOwnership(address, m_objectStore);
}

}}