// GarbageCollector.cpp
/*
 *
 * The CERN Tape Archive (CTA) project
 * Copyright (C) 2015  CERN
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "GarbageCollector.hpp"

#include "AgentReference.hpp"
#include "ArchiveQueueAlgorithms.hpp"
#include "ArchiveRequest.hpp"
#include "Helpers.hpp"
#include "RetrieveQueue.hpp"
#include "RetrieveRequest.hpp"
#include "RootEntry.hpp"

#include <algorithm>
#include <unistd.h>

namespace cta { namespace objectstore {

GarbageCollector::GarbageCollector(Backend & os, AgentReference & agentReference, catalogue::Catalogue & catalogue): 
  m_objectStore(os), m_catalogue(catalogue), m_ourAgentReference(agentReference), m_agentRegister(os) {
  // Look up the agent register address in the root entry, holding the root
  // entry's shared lock only for the duration of the lookup.
  {
    RootEntry rootEntry(m_objectStore);
    ScopedSharedLock rootEntryLock(rootEntry);
    rootEntry.fetch();
    m_agentRegister.setAddress(rootEntry.getAgentRegisterAddress());
  }
  // Prime our in-memory copy of the agent register.
  ScopedSharedLock registerLock(m_agentRegister);
  m_agentRegister.fetch();
}
GarbageCollector::~GarbageCollector(){
  // In production the garbage collector is never destroyed; this cleanup only
  // matters for unit tests, where it prevents memory leaks of the watchdogs.
  for (auto it = m_watchedAgents.begin(); it != m_watchedAgents.end(); ++it) {
    delete it->second;
  }
}

void GarbageCollector::runOnePass(log::LogContext & lc) {
  // One full garbage collection pass:
  // 1) drop watchdogs for agents that disappeared from the agent register,
  trimGoneTargets(lc);
  // 2) start watching register-owned agents we do not track yet,
  acquireTargets(lc);
  // 3) check heartbeats and garbage collect the agents found dead.
  checkHeartbeats(lc);
}
void GarbageCollector::trimGoneTargets(log::LogContext & lc) {
  // Refresh our view of the agent register (read-only, no lock needed).
  m_agentRegister.fetchNoLock();
  std::list<std::string> registeredAgents = m_agentRegister.getAgents();
  // Stop watching any agent that no longer appears in the register: it is
  // gone for good and there is nothing left to garbage collect.
  auto watched = m_watchedAgents.begin();
  while (watched != m_watchedAgents.end()) {
    const bool stillRegistered =
        std::find(registeredAgents.begin(), registeredAgents.end(), watched->first) != registeredAgents.end();
    if (stillRegistered) {
      ++watched;
    } else {
      delete watched->second;
      log::ScopedParamContainer params(lc);
      params.add("agentAddress", watched->first);
      watched = m_watchedAgents.erase(watched);
      lc.log(log::INFO, "In GarbageCollector::trimGoneTargets(): removed now gone agent.");
    }
  }
}
void GarbageCollector::acquireTargets(log::LogContext & lc) {
  // Refresh the register, then start watching every untracked agent which is
  // not ourselves and which is still owned by the agent register.
  m_agentRegister.fetchNoLock();
  std::list<std::string> candidatesList = m_agentRegister.getUntrackedAgents();
  // Build a set of our own tracked agents for fast membership tests.
  std::set<std::string> alreadyTrackedAgents;
  for (auto &ata: m_watchedAgents) {
    alreadyTrackedAgents.insert(ata.first);
  }
  for (auto &c: candidatesList) {
    // We don't monitor ourselves
    if (c != m_ourAgentReference.getAgentAddress() && !alreadyTrackedAgents.count(c)) {
      // So we have a candidate we might want to monitor
      // First, check that the agent entry exists, and that ownership
      // is indeed pointing to the agent register
      Agent ag(c, m_objectStore);
      try {
        ag.fetchNoLock();
      } catch (const Backend::NoSuchObject &) {
        // Maybe the agent has not been fully written to the objectstore backend
        // storage yet. It will be in the future, so continue.
        continue;
      } catch (...) {
        // The agent could simply be gone... (if not, let the complaint go through)
        if (m_objectStore.exists(c)) throw;
        continue;
      }
      // BUGFIX: the ownership test previously had an empty body, so agents no
      // longer owned by the register (e.g. already taken over by another
      // garbage collector) were tracked anyway. Only track register-owned ones.
      if (ag.getOwner() != m_agentRegister.getAddressIfSet()) {
        continue;
      }
      log::ScopedParamContainer params(lc);
      params.add("agentAddress", ag.getAddressIfSet())
            .add("gcAgentAddress", m_ourAgentReference.getAgentAddress());
      lc.log(log::INFO, "In GarbageCollector::acquireTargets(): started tracking an untracked agent");
      // Agent is to be tracked, let's track it.
      double timeout=ag.getTimeout();
      // The creation of the watchdog could fail as well (if agent gets deleted in the meantime).
      try {
        m_watchedAgents[c] = new AgentWatchdog(c, m_objectStore);
        m_watchedAgents[c]->setTimeout(timeout);
      } catch (...) {
        // Tolerate a watchdog creation failure only if the agent is gone.
        if (m_objectStore.exists(c)) throw;
        m_watchedAgents.erase(c);
        continue;
      }
    }
  }
}
 
void GarbageCollector::checkHeartbeats(log::LogContext & lc) {
  // Check the heartbeats of the watched agents.
  // We can still fail on many steps
  for (std::map<std::string, AgentWatchdog * >::iterator wa = m_watchedAgents.begin();
      wa != m_watchedAgents.end();) {
    // Get the heartbeat. Clean dead agents and remove references to them
    try {
      if (!wa->second->checkAlive()) {
        // The agent stopped updating its heartbeat: take over its owned
        // objects, then drop the watchdog. Note: erase(wa++) advances the
        // iterator before the erased node is invalidated.
        cleanupDeadAgent(wa->first, wa->second->getDeadAgentDetails(), lc);
        delete wa->second;
        m_watchedAgents.erase(wa++);
      } else {
        wa++;
      }
    } catch (cta::exception::Exception & ex) {
      if (wa->second->checkExists()) {
        // We really have a problem: we failed to check on an agent, that is still present.
        throw;
      } else {
        // The agent is simply gone on the wrong time. It will be trimmed from the list on the next pass.
        wa++;
      }
    }
  }
}

void GarbageCollector::cleanupDeadAgent(const std::string & address, std::list<log::Param> agentDetails, log::LogContext & lc) {
  // We detected a dead agent. Try and take ownership of it. It could already be owned
  // by another garbage collector.
  // To minimize locking, take a lock on the agent and check its ownership first.
  // We do not need to be defensive about exception here as calling function will
  // deal with them.
  Agent agent(address, m_objectStore);
  ScopedExclusiveLock agLock;
  try {
    // The agent could be gone while we try to lock it.
    agLock.lock(agent);
  } catch (Backend::NoSuchObject & ex) {
    log::ScopedParamContainer params(lc);
    params.add("agentAddress", agent.getAddressIfSet())
          .add("gcAgentAddress", m_ourAgentReference.getAgentAddress());
    lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): agent already deleted when trying to lock it. Skipping it.");
    return;
  }
  agent.fetch();
  log::ScopedParamContainer params(lc);
  params.add("agentAddress", agent.getAddressIfSet())
        .add("gcAgentAddress", m_ourAgentReference.getAgentAddress());
  // Another garbage collector may have taken the agent over between our
  // heartbeat check and this lock: in that case it is not ours to clean.
  if (agent.getOwner() != m_agentRegister.getAddressIfSet()) {
    params.add("agentOwner", agent.getOwner());
    lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): skipping agent which is not owned by agent register anymore.");
    // The agent will be removed from our ownership by the calling function: we're done.
    return;
  }
  // Acquire ownership of the agent. Prevent further updates to it.
  m_ourAgentReference.addToOwnership(address,m_objectStore);
  agent.setOwner(m_ourAgentReference.getAgentAddress());
  agent.setBeingGarbageCollected();
  agent.commit();
  // Update the register
  ScopedExclusiveLock arl(m_agentRegister);
  m_agentRegister.fetch();
  m_agentRegister.trackAgent(address);
  m_agentRegister.commit();
  arl.release();
  {
    // Log the details recorded by the watchdog about the dead agent.
    log::ScopedParamContainer params2(lc);
    for (auto p: agentDetails) params2.add(p.getName(), p.getValue());
    lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): will cleanup dead agent.");
  }
  // Return all objects owned by the agent to their respective backup owners.
  // The sorter first fetches and classifies the owned objects, then requeues
  // them per kind (archive jobs, retrieve jobs, everything else).
  OwnedObjectSorter ownedObjectSorter;
  std::list<std::shared_ptr<GenericObject>> fetchedObjects;
  ownedObjectSorter.fetchOwnedObjects(agent, fetchedObjects, m_objectStore, lc);
  ownedObjectSorter.sortFetchedObjects(agent, fetchedObjects, m_objectStore, m_catalogue, lc);
  ownedObjectSorter.lockFetchAndUpdateArchiveJobs(agent, m_ourAgentReference, m_objectStore, lc);
  ownedObjectSorter.lockFetchAndUpdateRetrieveJobs(agent, m_ourAgentReference, m_objectStore, lc);
  ownedObjectSorter.lockFetchAndUpdateOtherObjects(agent, m_ourAgentReference, m_objectStore, m_catalogue, lc);
}

void GarbageCollector::OwnedObjectSorter::fetchOwnedObjects(Agent& agent, std::list<std::shared_ptr<GenericObject> >& fetchedObjects,
    Backend & objectStore, log::LogContext & lc) {
  const auto ownedObjectAddresses = agent.getOwnershipList();
  // Parallel fetch (lock free) all the objects to assess their status (check ownership,
  // type) and decide to which queue they will go.
  std::list<std::shared_ptr<GenericObject>> ownedObjects;
  std::map<GenericObject *, std::unique_ptr<GenericObject::AsyncLockfreeFetcher>> ownedObjectsFetchers;
  // This will be the list of objects we failed to garbage collect. This means the garbage collection
  // will be partial (looping?).
  std::list<std::string> skippedObjects;
  // This will be the list of objects that are simply gone. We will still need to remove them from the
  // ownership list of the agent.
  std::list<std::string> goneObjects;
  // 1) Launch the async fetch of all the objects.
  for (auto & obj : ownedObjectAddresses) {
    // Fetch generic objects
    ownedObjects.emplace_back(new GenericObject(obj, objectStore));
    try {
      ownedObjectsFetchers[ownedObjects.back().get()].reset(ownedObjects.back()->asyncLockfreeFetch());
    } catch (cta::exception::Exception & ex) {
      // We failed to launch the fetch. This is unexpected (absence of object will come later). We will not be able
      // to garbage collect this object.
      skippedObjects.emplace_back(obj);
      // Cleanup object reference. We will skip on this object. That means the garbage collection will
      // be left incomplete
      ownedObjectsFetchers.erase(ownedObjects.back().get());
      ownedObjects.pop_back();
      // Log the error.
      log::ScopedParamContainer params(lc);
      params.add("objectAddress", obj)
            .add("exceptionMessage", ex.getMessageValue());
      lc.log(log::ERR, "In GarbageCollector::OwnedObjectSorter::fetchOwnedObjects(): "
          "failed to asyncLockfreeFetch(): skipping object. Garbage collection will be incomplete.");
    }
  }
  
  // 2) Find out the result of the fetches.
  bool ownershipUdated=false;
  for (auto & obj : ownedObjects) {
    log::ScopedParamContainer params2(lc);
    params2.add("objectAddress", obj->getAddressIfSet());
    try {
      ownedObjectsFetchers.at(obj.get())->wait();
    } catch (Backend::NoSuchObject & ex) {
      // The object vanished: just drop it from the agent's ownership list.
      goneObjects.push_back(obj->getAddressIfSet());
      lc.log(log::INFO, "In GarbageCollector::OwnedObjectSorter::fetchOwnedObjects(): skipping garbage collection of now gone object.");
      ownedObjectsFetchers.erase(obj.get());
      agent.removeFromOwnership(obj->getAddressIfSet());
      ownershipUdated=true;
      continue;
    } catch (cta::exception::Exception & ex) {
      // Again, we have a problem. We will skip the object and have an incomplete GC.
      skippedObjects.push_back(obj->getAddressIfSet());
      params2.add("exceptionMessage", ex.getMessageValue());
      lc.log(log::ERR, "In GarbageCollector::OwnedObjectSorter::fetchOwnedObjects(): "
          "failed to AsyncLockfreeFetch::wait(): skipping object. Garbage collection will be incomplete.");
      ownedObjectsFetchers.erase(obj.get());
      continue;
    }
    // This object passed the cut, we can record it for next round.
    ownedObjectsFetchers.erase(obj.get());
    fetchedObjects.emplace_back(obj);
  }
  // The generic objects we are interested in are now also stored in fetchedObjects.
  ownedObjects.clear();
  // Persist any ownership trimming in a single commit.
  if (ownershipUdated) agent.commit();
}

void GarbageCollector::OwnedObjectSorter::sortFetchedObjects(Agent& agent, std::list<std::shared_ptr<GenericObject> >& fetchedObjects,
    Backend & objectStore, cta::catalogue::Catalogue & catalogue, log::LogContext & lc) {
  // 3) Now decide the fate of each fetched and owned object: route archive and
  // retrieve requests to their destination queues, and leave everything else
  // to per-object garbage collection (otherObjects).
  bool ownershipUpdated=false;
  using serializers::ArchiveJobStatus;
  using serializers::RetrieveJobStatus;
  for (auto & obj: fetchedObjects) {
    log::ScopedParamContainer params2(lc);
    params2.add("objectAddress", obj->getAddressIfSet());
    if (obj->getOwner() != agent.getAddressIfSet()) {
      // For all object types except ArchiveRequests, this means we do
      // not need to deal with this object.
      if (obj->type() == serializers::ArchiveRequest_t) {
        // An ArchiveRequest has per-job owners: garbage collect it if any of
        // its jobs is still owned by the dead agent.
        ArchiveRequest ar(*obj);
        for (auto & j:ar.dumpJobs()) if (j.owner == agent.getAddressIfSet()) goto doGCObject;
      } else {
        // Log the owner (except for archiveRequests which can have several owners).
        // BUGFIX: this used to log the object's own address under "actualOwner".
        params2.add("actualOwner", obj->getOwner());
      }
      lc.log(log::WARNING, "In GarbageCollector::OwnedObjectSorter::sortFetchedObjects(): skipping object which is not owned by this agent");
      agent.removeFromOwnership(obj->getAddressIfSet());
      ownershipUpdated=true;
      continue;
    }
  doGCObject:
    switch (obj->type()) {
      case serializers::ArchiveRequest_t:
      {
        // We need to find out in which queue or queues the owned job(s) go.
        // Decision is simple: if the job is owned and active, it needs to be requeued
        // in its destination archive queue.
        // Get hold of an (unlocked) archive request:
        std::shared_ptr<ArchiveRequest> ar(new ArchiveRequest(*obj));
        obj.reset();
        bool jobRequeued=false;
        for (auto &j: ar->dumpJobs()) {
          if ((j.owner == agent.getAddressIfSet() && ar->c_statusesImplyingQueueing.count(j.status))) {
            std::string containerIdentifier;
            try {
              // Repack-destined jobs are queued by repack request address,
              // user jobs by tape pool.
              if(ar->c_statusesImplyingQueueingByRepackRequestAddress.count(j.status)){
                containerIdentifier = ar->getRepackInfo().repackRequestAddress;
              } else {
                containerIdentifier = j.tapePool;
              }
              archiveQueuesAndRequests[std::make_tuple(containerIdentifier, ar->getJobQueueType(j.copyNb),j.tapePool)].emplace_back(ar);
              log::ScopedParamContainer params3(lc);
              params3.add("tapePool", j.tapePool)
                     .add("containerIdentifier", containerIdentifier)
                     .add("copynb", j.copyNb)
                     .add("fileId", ar->getArchiveFile().archiveFileID);
              lc.log(log::INFO, "Selected archive request for requeueing to the corresponding queue");
              jobRequeued=true;
            } catch (ArchiveRequest::JobNotQueueable &) {
              log::ScopedParamContainer params3(lc);
              params3.add("tapePool", j.tapePool)
                     .add("containerIdentifier", containerIdentifier)
                     .add("copynb", j.copyNb)
                     .add("status",ArchiveRequest::statusToString(j.status))
                     .add("fileId", ar->getArchiveFile().archiveFileID);
              lc.log(log::WARNING, "Job garbage collected with a status not queueable. Leaving it as is.");
            }
          }
        }
        if (!jobRequeued) {
          log::ScopedParamContainer params3(lc);
          params3.add("fileId", ar->getArchiveFile().archiveFileID);
          lc.log(log::INFO, "No active archive job to requeue found. Request will remain as-is.");
        }
        break;
      }
      case serializers::RetrieveRequest_t:
      {
        // We need here to re-determine the best tape (and queue) for the retrieve request.
        std::shared_ptr<RetrieveRequest> rr(new RetrieveRequest(*obj));
        obj.reset();
        // Get the list of vids for non failed tape files.
        std::set<std::string> candidateVids;
        bool disabledTape = rr->getRepackInfo().forceDisabledTape;
        for (auto & j: rr->dumpJobs()) {
          if(j.status==RetrieveJobStatus::RJS_ToTransfer) {
            for (auto &tf: rr->getArchiveFile().tapeFiles) {
              if ((tf.copyNb == j.copyNb) && (tf.supersededByVid.empty()))
                candidateVids.insert(tf.vid);
            }
          }
        }
        // Small parenthesis for non transfer cases.
        if (candidateVids.empty()) {
          // If the queueType of the RetrieveRequest is FailedJobs or JobsToReportToUser, it needs to be requeued in a queue identified by the vid of the tape.
          // If queueType is JobsToReportToRepackForSuccess or JobsToReportToRepackForFailure, it needs to be requeued in a queue identified by the RepackRequest's address.
          try {
            std::string vid = rr->getArchiveFile().tapeFiles.begin()->vid;
            if(rr->getQueueType() != JobQueueType::FailedJobs && rr->getQueueType() != JobQueueType::JobsToReportToUser){
              retrieveQueuesAndRequests[std::make_tuple(rr->getRepackInfo().repackRequestAddress, rr->getQueueType(),vid)].emplace_back(rr);
            } else {
              // The request has failed, might need to be added to the failed to report of failed queue/container.
              retrieveQueuesAndRequests[std::make_tuple(vid, rr->getQueueType(),vid)].emplace_back(rr);
            }
            break;
          } catch (cta::exception::Exception & ex) {
            log::ScopedParamContainer params3(lc);
            params3.add("fileId", rr->getArchiveFile().archiveFileID)
                   .add("exceptionMessage", ex.getMessageValue());
            lc.log(log::ERR, "Failed to determine destination queue for retrieve request. Marking request for normal GC (and probably deletion).");
            otherObjects.emplace_back(new GenericObject(rr->getAddressIfSet(), objectStore));
            break;
          }
        }
        // Back to the transfer case.
        std::string vid;
        try {
          vid=Helpers::selectBestRetrieveQueue(candidateVids, catalogue, objectStore, disabledTape);
        } catch (Helpers::NoTapeAvailableForRetrieve & ex) {
          log::ScopedParamContainer params3(lc);
          params3.add("fileId", rr->getArchiveFile().archiveFileID);
          lc.log(log::INFO, "No available tape found. Marking request for normal GC (and probably deletion).");
          otherObjects.emplace_back(new GenericObject(rr->getAddressIfSet(), objectStore));
          break;
        }
        retrieveQueuesAndRequests[std::make_tuple(vid, JobQueueType::JobsToTransferForUser,vid)].emplace_back(rr);
        log::ScopedParamContainer params3(lc);
        // Find copyNb for logging
        size_t copyNb = std::numeric_limits<size_t>::max();
        uint64_t fSeq = std::numeric_limits<uint64_t>::max();
        for (auto & tc: rr->getArchiveFile().tapeFiles) { if (tc.vid==vid) { copyNb=tc.copyNb; fSeq=tc.fSeq; } }
        params3.add("fileId", rr->getArchiveFile().archiveFileID)
               .add("copyNb", copyNb)
               .add("tapeVid", vid)
               .add("fSeq", fSeq);
        lc.log(log::INFO, "Selected vid to be requeued for retrieve request.");
      }
      break;
      default:
        // For other objects, we will not implement any optimization and simply call
        // their individual garbageCollect method.
        otherObjects.emplace_back(obj);
        obj.reset();
        break;
    }
    // We can now get rid of the generic object (data was transferred in a (typed) object in the sorter).
  }
  // We are now done with the next container.
  if (ownershipUpdated) agent.commit();
  fetchedObjects.clear();
}

// Requeue a batch of archive requests into the queue type selected by the
// ArchiveSpecificQueue template parameter, switching their ownership away from
// the dead agent. queueAddress is an output: the address of the queue used.
// Requests that could not be requeued are either skipped (gone/not owned) or
// recorded in jobsIndividuallyGCed/otherObjects for per-object GC.
template<typename ArchiveSpecificQueue>
void GarbageCollector::OwnedObjectSorter::executeArchiveAlgorithm(std::list<std::shared_ptr<ArchiveRequest>> &jobs,std::string &queueAddress, const std::string& containerIdentifier, const std::string& tapepool, 
        std::set<std::string> & jobsIndividuallyGCed, Agent& agent, AgentReference& agentReference, 
        Backend &objectStore, log::LogContext& lc)
{
  typedef ContainerAlgorithms<ArchiveQueue,ArchiveSpecificQueue> AqAlgos;
  AqAlgos aqcl(objectStore, agentReference);
  typename decltype(aqcl)::InsertedElement::list jobsToAdd;
  for (auto & ar: jobs) {
    // Determine the copy number and feed the queue with it.
    for (auto &j: ar->dumpJobs()) {
      if (j.tapePool == tapepool) {
        jobsToAdd.push_back({ar.get(), j.copyNb, ar->getArchiveFile(), ar->getMountPolicy(), cta::nullopt});         
      }
    }
  }
  std::set<std::string> jobsNotRequeued;
  try {
    // Bulk requeue: reference the jobs in the queue and take ownership away
    // from the dead agent in one algorithm call.
    aqcl.referenceAndSwitchOwnershipIfNecessary(containerIdentifier, agent.getAddressIfSet(), queueAddress, jobsToAdd, lc);
  } catch (typename AqAlgos::OwnershipSwitchFailure & failure) {
    // Some individual ownership switches failed: dissect each failure.
    for (auto &failedAR: failure.failedElements) {
      try {
        std::rethrow_exception(failedAR.failure);
      } catch (cta::exception::Exception & e) {
        // Update did not go through. It could be benign
        std::string debugType=typeid(e).name();
        auto & arup=*failedAR.element;
        jobsNotRequeued.insert(arup.archiveRequest->getAddressIfSet());
        if (typeid(e) == typeid(Backend::NoSuchObject) || typeid(e) == typeid(Backend::WrongPreviousOwner)) {
          // The object was not present or not owned during update, so we skip it.
          // This is nevertheless unexpected (from previous fetch, so this is an error).
          log::ScopedParamContainer params(lc);
          params.add("archiveRequestObject", arup.archiveRequest->getAddressIfSet())
                .add("copyNb", arup.copyNb)
                .add("fileId", arup.archiveRequest->getArchiveFile().archiveFileID)
                .add("exceptionType", debugType);
          int logLevel = typeid(e) == typeid(Backend::NoSuchObject) ? log::WARNING : log::ERR;
          lc.log(logLevel, 
              "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateArchiveJobs(): "
              "failed to requeue gone/not owned archive job. Removed from queue.");
        } else {
          // We have an unexpected error. We will handle this with the request-by-request garbage collection.
          log::ScopedParamContainer params(lc);
          params.add("archiveRequestObject", arup.archiveRequest->getAddressIfSet())
                .add("copyNb", arup.copyNb)
                .add("fileId", arup.archiveRequest->getArchiveFile().archiveFileID)
                .add("exceptionType", debugType)
                .add("exceptionMessage", e.getMessageValue());
          lc.log(log::ERR, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateArchiveJobs(): "
              "failed to requeue archive job with unexpected error. "
              "Removing from queue and will re-run individual garbage collection.");
          // We will re-run the individual GC for this one.
          jobsIndividuallyGCed.insert(arup.archiveRequest->getAddressIfSet());
          otherObjects.emplace_back(new GenericObject(arup.archiveRequest->getAddressIfSet(), objectStore));
        }
      }
    }
  }
  // We can now log individually requeued jobs.
  for (auto & arup: jobsToAdd) {
    if (!jobsNotRequeued.count(arup.archiveRequest->getAddressIfSet())) {
        // OK, the job made it to the queue
        log::ScopedParamContainer params(lc);
        params.add("archiveRequestObject", arup.archiveRequest->getAddressIfSet())
              .add("copyNb", arup.copyNb)
              .add("fileId", arup.archiveRequest->getArchiveFile().archiveFileID)
              .add("tapePool", tapepool)
              .add("archiveQueueObject", queueAddress)
              .add("garbageCollectedPreviousOwner", agent.getAddressIfSet());
        lc.log(log::INFO, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateArchiveJobs(): requeued archive job.");
    }
  }
  jobsToAdd.clear();
}

std::string GarbageCollector::OwnedObjectSorter::dispatchArchiveAlgorithms(std::list<std::shared_ptr<ArchiveRequest>> &jobs,const JobQueueType& jobQueueType, const std::string& containerIdentifier,
        const std::string& tapepool,std::set<std::string> & jobsIndividuallyGCed, 
        Agent& agent, AgentReference& agentReference, Backend & objectstore, log::LogContext &lc) {
  // Instantiate the archive queue algorithm matching the requested queue type
  // and run it over the batch. The address of the queue actually used is
  // returned to the caller (empty if the queue type is unknown).
  std::string usedQueueAddress;
  switch (jobQueueType) {
    case JobQueueType::JobsToTransferForUser:
      executeArchiveAlgorithm<ArchiveQueueToTransferForUser>(
          jobs, usedQueueAddress, containerIdentifier, tapepool, jobsIndividuallyGCed, agent, agentReference, objectstore, lc);
      break;
    case JobQueueType::JobsToReportToUser:
      executeArchiveAlgorithm<ArchiveQueueToReportForUser>(
          jobs, usedQueueAddress, containerIdentifier, tapepool, jobsIndividuallyGCed, agent, agentReference, objectstore, lc);
      break;
    case JobQueueType::JobsToTransferForRepack:
      executeArchiveAlgorithm<ArchiveQueueToTransferForRepack>(
          jobs, usedQueueAddress, containerIdentifier, tapepool, jobsIndividuallyGCed, agent, agentReference, objectstore, lc);
      break;
    case JobQueueType::JobsToReportToRepackForSuccess:
      executeArchiveAlgorithm<ArchiveQueueToReportToRepackForSuccess>(
          jobs, usedQueueAddress, containerIdentifier, tapepool, jobsIndividuallyGCed, agent, agentReference, objectstore, lc);
      break;
    case JobQueueType::JobsToReportToRepackForFailure:
      executeArchiveAlgorithm<ArchiveQueueToReportToRepackForFailure>(
          jobs, usedQueueAddress, containerIdentifier, tapepool, jobsIndividuallyGCed, agent, agentReference, objectstore, lc);
      break;
    case JobQueueType::FailedJobs:
      executeArchiveAlgorithm<ArchiveQueueFailed>(
          jobs, usedQueueAddress, containerIdentifier, tapepool, jobsIndividuallyGCed, agent, agentReference, objectstore, lc);
      break;
    default:
      // No matching algorithm: nothing queued, address stays empty.
      break;
  }
  return usedQueueAddress;
}

// TODO: We should record the VID in the ArchiveRequest object to allow requeueing in the proper report queue (currently, the report queue is selected
// by tapepool, which works but is not the most efficient way to report the request (contention problem)).
void GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateArchiveJobs(Agent& agent, AgentReference& agentReference, Backend & objectStore,
    log::LogContext & lc) {
  // We can now start updating the objects efficiently. We still need to re-fetch them locked 
  // and validate ownership.
  // 
  // 1) Get the archive requests done.
  for (auto & archiveQueueIdAndReqs: archiveQueuesAndRequests) {
    // The number of objects to requeue could be very high. In order to limit the time taken by the
    // individual requeue operations, we limit the number of concurrently requeued objects to an 
    // arbitrary 500.
    std::string containerIdentifier;
    std::string tapepool;
    JobQueueType queueType;
    std::tie(containerIdentifier, queueType, tapepool) = archiveQueueIdAndReqs.first;
    auto & requestsList = archiveQueueIdAndReqs.second;
    while (requestsList.size()) {
      // Move up to ~500 requests into the current batch.
      decltype (archiveQueueIdAndReqs.second) currentJobBatch;
      while (requestsList.size() && currentJobBatch.size() <= 500) {
        currentJobBatch.emplace_back(std::move(requestsList.front()));
        requestsList.pop_front();
      }
      // Requests that fail bulk requeueing and fall back to individual GC.
      std::set<std::string> jobsIndividuallyGCed;
      utils::Timer t;
      // Dispatch the archive algorithms
      dispatchArchiveAlgorithms(currentJobBatch,queueType,containerIdentifier,tapepool,jobsIndividuallyGCed,agent,agentReference,objectStore,lc);
      
      // We can now forget pool level list. But before that, we can remove the objects 
      // from agent ownership if this was the last reference to it.
      // The usage of use_count() is safe here because we are in a single threaded environment.
      // In a multi threaded environment, its usage would not be appropriate.
      // See for example http://en.cppreference.com/w/cpp/memory/shared_ptr/use_count
      bool ownershipUpdated=false;
      auto agentOwnership=agent.getOwnershipSet();
      for (auto &ar: currentJobBatch) {
        if (ar.use_count() == 1 && !jobsIndividuallyGCed.count(ar->getAddressIfSet())) {
          // This tapepool is the last user of this archive request. We will remove it from ownership.
          agentOwnership.erase(ar->getAddressIfSet());
          ownershipUpdated=true;
          log::ScopedParamContainer params(lc);
          params.add("archiveRequestObject", ar->getAddressIfSet());
          lc.log(log::DEBUG, "Removed AR from agent ownership.");
        } else {
          // Still referenced by another queue entry, or flagged for individual
          // GC: keep it in the agent's ownership for now.
          log::ScopedParamContainer params(lc);
          params.add("archiveRequestObject", ar->getAddressIfSet())
                .add("use_count", ar.use_count())
                .add("IndividuallyGCed", jobsIndividuallyGCed.count(ar->getAddressIfSet()));
          lc.log(log::DEBUG, "Did not remove AR from agent ownership.");          
        }
      }
      if (ownershipUpdated) {
        agent.resetOwnership(agentOwnership);
        agent.commit();
      }
      currentJobBatch.clear();
      // Sleep a bit if we have other rounds to go not to hog the queue
      if (archiveQueueIdAndReqs.second.size()) sleep (5);
    }
  }
}

// Stage 2 of dead-agent cleanup: requeue the retrieve requests previously owned by the
// dead agent, grouped by (container identifier, queue type, vid). Requests are requeued
// in batches of up to ~500: for each batch we lock and fetch the target retrieve queue,
// reference the jobs in it, commit, then asynchronously update each request's ownership.
// Requests whose update fails are removed from the queue again; unexpected failures are
// additionally deferred to individual garbage collection (via otherObjects).
void GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateRetrieveJobs(Agent& agent, AgentReference& agentReference,
    Backend & objectStore, log::LogContext & lc) {
  // 2) Get the retrieve requests done. They are simpler as retrieve requests are fully owned.
  // They should hence not have changed since we pre-fetched them.
  for (auto & retrieveQueueIdAndReqs: retrieveQueuesAndRequests) {
    std::string containerIdentifier;
    JobQueueType queueType;
    std::string vid;
    std::tie(containerIdentifier, queueType, vid) = retrieveQueueIdAndReqs.first;
    auto & requestsList = retrieveQueueIdAndReqs.second;
    while (requestsList.size()) {
      // Carve out a batch of up to ~500 requests so we do not hold the queue lock for too long.
      decltype (retrieveQueueIdAndReqs.second) currentJobBatch;
      while (requestsList.size() && currentJobBatch.size() <= 500) {
        currentJobBatch.emplace_back(std::move(requestsList.front()));
        requestsList.pop_front();
      }
      // Per-batch timing and accounting, reported in the summary log at the end of the batch.
      double queueLockFetchTime=0;
      double queueProcessAndCommitTime=0;
      double requestsUpdatePreparationTime=0;
      double requestsUpdatingTime=0;
      double queueRecommitTime=0;
      uint64_t filesQueued=0;
      uint64_t filesDequeued=0;
      uint64_t bytesQueued=0;
      uint64_t bytesDequeued=0;
      uint64_t filesBefore=0;
      uint64_t bytesBefore=0;
      utils::Timer t;
      // Get the retrieve queue and add references to the jobs to it.
      RetrieveQueue rq(objectStore);
      ScopedExclusiveLock rql;
      Helpers::getLockedAndFetchedJobQueue<RetrieveQueue>(rq,rql, agentReference, containerIdentifier, queueType, lc);
      queueLockFetchTime = t.secs(utils::Timer::resetCounter);
      auto jobsSummary=rq.getJobsSummary();
      filesBefore=jobsSummary.jobs;
      bytesBefore=jobsSummary.bytes;
      // Prepare the list of requests to add to the queue (if needed).
      std::list<RetrieveQueue::JobToAdd> jta;
      // We have the queue. We will loop on the requests, add them to the list. We will launch their updates
      // after committing the queue.
      for (auto & rr: currentJobBatch) {
        // Determine the copy number and feed the queue with it.
        for (auto &tf: rr->getArchiveFile().tapeFiles) {
          if (tf.vid == vid) {
            jta.push_back({tf.copyNb, tf.fSeq, rr->getAddressIfSet(), rr->getArchiveFile().fileSize,
                rr->getRetrieveFileQueueCriteria().mountPolicy, rr->getEntryLog().time, rr->getActivity(), rr->getDiskSystemName()});
          }
        }
      }
      auto addedJobs = rq.addJobsIfNecessaryAndCommit(jta, agentReference, lc);
      queueProcessAndCommitTime = t.secs(utils::Timer::resetCounter);
      // Record what was actually queued. These counters were previously never assigned,
      // so the summary log's "filesAdded"/"bytesAdded" (queued - dequeued) underflowed
      // whenever any request had to be dequeued after an error.
      filesQueued = addedJobs.files;
      bytesQueued = addedJobs.bytes;
      // If we have an unexpected failure, we will re-run the individual garbage collection. Before that,
      // we will NOT remove the object from agent's ownership. This variable is declared a bit ahead so
      // the goto will not cross its initialization.
      std::set<std::string> jobsIndividuallyGCed;
      if (!addedJobs.files) {
        goto agentCleanupForRetrieve;
      }
      // We will keep individual references for each job update we launch so that we make
      // our life easier downstream.
      struct RRUpdatedParams {
        std::unique_ptr<RetrieveRequest::AsyncJobOwnerUpdater> updater;
        std::shared_ptr<RetrieveRequest> retrieveRequest;
        uint32_t copyNb;
      };
      {
        // Launch all the asynchronous ownership updates (queue becomes the owner).
        std::list<RRUpdatedParams> rrUpdatersParams;
        for (auto & rr: currentJobBatch) {
          for (auto & tf: rr->getArchiveFile().tapeFiles) {
            if (tf.vid == vid) {
              rrUpdatersParams.emplace_back();
              rrUpdatersParams.back().retrieveRequest = rr;
              rrUpdatersParams.back().copyNb = tf.copyNb;
              rrUpdatersParams.back().updater.reset(rr->asyncUpdateJobOwner(tf.copyNb,
                  rq.getAddressIfSet(), agent.getAddressIfSet()));
            }
          }
        }
        requestsUpdatePreparationTime = t.secs(utils::Timer::resetCounter);
        // Now collect the results.
        std::list<std::string> requestsToDequeue;
        for (auto & rrup: rrUpdatersParams) {
          try {
            rrup.updater->wait();
            // OK, the job made it to the queue.
            log::ScopedParamContainer params(lc);
            params.add("retrieveRequestObject", rrup.retrieveRequest->getAddressIfSet())
                  .add("copyNb", rrup.copyNb)
                  .add("fileId", rrup.retrieveRequest->getArchiveFile().archiveFileID)
                  .add("tapeVid", vid)
                  .add("retrieveQueueObject", rq.getAddressIfSet())
                  .add("garbageCollectedPreviousOwner", agent.getAddressIfSet());
            lc.log(log::INFO, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateRetrieveJobs(): requeued retrieve job.");
          } catch (cta::exception::Exception & e) {
            // Update did not go through. It could be benign.
            std::string debugType=typeid(e).name();
            if (typeid(e) == typeid(Backend::NoSuchObject) ||
                typeid(e) == typeid(Backend::WrongPreviousOwner)) {
              // The object was not present or not owned during update, so we skip it.
              // This is nevertheless unexpected (from previous fetch, so this is an error).
              log::ScopedParamContainer params(lc);
              params.add("retrieveRequestObject", rrup.retrieveRequest->getAddressIfSet())
                    .add("copyNb", rrup.copyNb)
                    .add("fileId", rrup.retrieveRequest->getArchiveFile().archiveFileID)
                    .add("exceptionType", debugType);
              lc.log(log::ERR, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateRetrieveJobs(): "
                  "failed to requeue gone/not owned retrieve job. Removing from queue.");
            } else {
              // We have an unexpected error. Log it, and remove from queue. Not much we can
              // do at this point.
              log::ScopedParamContainer params(lc);
              params.add("retrieveRequestObject", rrup.retrieveRequest->getAddressIfSet())
                    .add("copyNb", rrup.copyNb)
                    .add("fileId", rrup.retrieveRequest->getArchiveFile().archiveFileID)
                    .add("exceptionType", debugType)
                    .add("exceptionMessage", e.getMessageValue());
              lc.log(log::ERR, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateRetrieveJobs(): "
                  "failed to requeue retrieve job with unexpected error. Removing from queue and will re-run individual garbage collection.");
              // We will re-run the individual GC for this one.
              jobsIndividuallyGCed.insert(rrup.retrieveRequest->getAddressIfSet());
              otherObjects.emplace_back(new GenericObject(rrup.retrieveRequest->getAddressIfSet(), objectStore));
            }
            // In all cases, the object did NOT make it to the queue.
            filesDequeued ++;
            bytesDequeued += rrup.retrieveRequest->getArchiveFile().fileSize;
            requestsToDequeue.push_back(rrup.retrieveRequest->getAddressIfSet());
          }
        }
        requestsUpdatingTime = t.secs(utils::Timer::resetCounter);
        if (requestsToDequeue.size()) {
          rq.removeJobsAndCommit(requestsToDequeue);
          log::ScopedParamContainer params(lc);
          // Fixed misspelled log key ("retreveQueueObject") so it matches the key used elsewhere.
          params.add("retrieveQueueObject", rq.getAddressIfSet());
          lc.log(log::INFO, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateRetrieveJobs(): "
              "Cleaned up and re-committed retrieve queue after error handling.");
          queueRecommitTime = t.secs(utils::Timer::resetCounter);
        }
      }
      {
        // Log a summary of what happened to this batch.
        log::ScopedParamContainer params(lc);
        auto jobsSummary = rq.getJobsSummary();
        params.add("tapeVid", vid)
              .add("retrieveQueueObject", rq.getAddressIfSet())
              .add("filesAdded", filesQueued - filesDequeued)
              .add("bytesAdded", bytesQueued - bytesDequeued)
              .add("filesAddedInitially", filesQueued)
              .add("bytesAddedInitially", bytesQueued)
              .add("filesDequeuedAfterErrors", filesDequeued)
              .add("bytesDequeuedAfterErrors", bytesDequeued)
              .add("filesBefore", filesBefore)
              .add("bytesBefore", bytesBefore)
              .add("filesAfter", jobsSummary.jobs)
              .add("bytesAfter", jobsSummary.bytes)
              .add("queueLockFetchTime", queueLockFetchTime)
              .add("queuePreparationTime", queueProcessAndCommitTime)
              .add("requestsUpdatePreparationTime", requestsUpdatePreparationTime)
              .add("requestsUpdatingTime", requestsUpdatingTime)
              .add("queueRecommitTime", queueRecommitTime);
        lc.log(log::INFO, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateRetrieveJobs(): "
            "Requeued a batch of retrieve requests.");
      }
      // We can now forget pool level list. But before that, we can remove the objects
      // from agent ownership if this was the last reference to it.
      // The usage of use_count() is safe here because we are in a single threaded environment.
      // In a multi threaded environment, its usage would not be appropriate.
      // See for example http://en.cppreference.com/w/cpp/memory/shared_ptr/use_count
    agentCleanupForRetrieve:
      bool ownershipUpdated=false;
      for (auto &rr: currentJobBatch) {
        if (rr.use_count() == 1 && !jobsIndividuallyGCed.count(rr->getAddressIfSet())) {
          // This queue was the last user of this retrieve request. We will remove it from ownership.
          agent.removeFromOwnership(rr->getAddressIfSet());
          ownershipUpdated=true;
        }
      }
      if (ownershipUpdated) agent.commit();
      currentJobBatch.clear();
      // Sleep a bit if we have other rounds to go not to hog the queue.
      if (retrieveQueueIdAndReqs.second.size()) sleep (5);
    }
  }
}

769
770
// Stage 3 of dead-agent cleanup: garbage collect, one by one, the remaining owned
// objects that do not require mutualized (queue-based) requeueing. Each object is
// dispatched to its type-specific garbage collector, then removed from the dead
// agent's ownership. Finally the dead agent's own entry is removed and deregistered.
void GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateOtherObjects(Agent& agent, AgentReference& agentReference,
    Backend & objectStore, cta::catalogue::Catalogue & catalogue, log::LogContext & lc) {
  // 3) We are done with the objects requiring mutualized queueing, and hence special treatment.
  // The rest will be garbage collected on an object-by-object basis.
  for (auto & go : otherObjects) {
   // Find the object
   log::ScopedParamContainer params2(lc);
   params2.add("objectAddress", go->getAddressIfSet());
   // If the object does not exist, we're done.
   if (go->exists()) {
     ScopedExclusiveLock goLock(*go);
     go->fetch();
     // Call GenericObject's garbage collect method, which in turn will
     // delegate to the object type's garbage collector.
     go->garbageCollectDispatcher(goLock, agent.getAddressIfSet(), agentReference, lc, catalogue);
     lc.log(log::INFO, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateOtherObjects(): garbage collected owned object.");
   } else {
     lc.log(log::INFO, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateOtherObjects(): "
         "skipping garbage collection of now gone object.");
   }
   // In all cases, relinquish ownership for this object
   agent.removeFromOwnership(go->getAddressIfSet());
   agent.commit();
  }
  // We now processed all the owned objects. We can delete the agent's entry.
  agent.removeAndUnregisterSelf(lc);
  // Fixed log message: it previously claimed to come from GarbageCollector::cleanupDeadAgent(),
  // which is not the function emitting it.
  lc.log(log::INFO, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateOtherObjects(): agent entry removed.");
  // We can remove the agent from our own ownership.
  agentReference.removeFromOwnership(agent.getAddressIfSet(), objectStore);
}

800
}}