/*
 * The CERN Tape Archive (CTA) project
 * Copyright (C) 2015  CERN
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "GarbageCollector.hpp"
#include "AgentReference.hpp"
#include "ArchiveRequest.hpp"
#include "RetrieveRequest.hpp"
#include "ArchiveQueue.hpp"
#include "RetrieveQueue.hpp"
#include "Helpers.hpp"
#include "RootEntry.hpp"
#include <algorithm>
#include <unistd.h>

namespace cta { namespace objectstore {

GarbageCollector::GarbageCollector(Backend & os, AgentReference & agentReference, catalogue::Catalogue & catalogue): 
  m_objectStore(os), m_catalogue(catalogue), m_ourAgentReference(agentReference), m_agentRegister(os) {
  RootEntry re(m_objectStore);
  ScopedSharedLock reLock(re);
  re.fetch();
  m_agentRegister.setAddress(re.getAgentRegisterAddress());
  reLock.release();
  ScopedSharedLock arLock(m_agentRegister);
  m_agentRegister.fetch();
}

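// A minimal driver sketch (assumptions: a daemon wrapper owns the instance and
// provides a stop flag and poll interval; none of these names are part of this file):
//   GarbageCollector gc(backend, agentRef, catalogue);
//   while (!stopRequested) { gc.runOnePass(lc); sleep(pollIntervalSecs); }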
void GarbageCollector::runOnePass(log::LogContext & lc) {
  trimGoneTargets(lc);
  aquireTargets(lc);
  checkHeartbeats(lc);
}
void GarbageCollector::trimGoneTargets(log::LogContext & lc) {
  m_agentRegister.fetchNoLock();
  std::list<std::string> agentList = m_agentRegister.getAgents();
  // Find the agents we knew about that are no longer listed:
  // we simply stop watching them.
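  // Note the post-increment erase(wa++) idiom below: it keeps the iterator valid
  // while entries are removed mid-iteration.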
  for (std::map<std::string, AgentWatchdog * >::iterator wa
        = m_watchedAgents.begin();
      wa != m_watchedAgents.end();) {
    if (agentList.end() == std::find(agentList.begin(), agentList.end(), wa->first)) {
      delete wa->second;
      log::ScopedParamContainer params(lc);
      params.add("agentAddress", wa->first);
      m_watchedAgents.erase(wa++);
      lc.log(log::INFO, "In GarbageCollector::trimGoneTargets(): removed now gone agent.");
    } else {
      wa++;
    }
  }
}
void GarbageCollector::aquireTargets(log::LogContext & lc) {
  m_agentRegister.fetchNoLock();
  // We will now watch all agents we do not know about yet.
  std::list<std::string> candidatesList = m_agentRegister.getUntrackedAgents();
  // Build a set of our own tracked agents.
  std::set<std::string> alreadyTrackedAgents;
  for (auto &ata: m_watchedAgents) {
    alreadyTrackedAgents.insert(ata.first);
  }
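  // (The set gives a cheap membership test per candidate in the loop below.)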
  for (auto &c: candidatesList) {
    // We don't monitor ourselves
    if (c != m_ourAgentReference.getAgentAddress() && !alreadyTrackedAgents.count(c)) {
      // So we have a candidate we might want to monitor
      // First, check that the agent entry exists, and that ownership
      // is indeed pointing to the agent register
      Agent ag(c, m_objectStore);
      try {
        ag.fetchNoLock();
      } catch (...) {
        // The agent could simply be gone... (If not, let the complaint go through.)
        if (m_objectStore.exists(c)) throw;
        continue;
      }
      // Only watch agents whose ownership still points to the agent register.
      if (ag.getOwner() != m_agentRegister.getAddressIfSet()) continue;
      log::ScopedParamContainer params(lc);
      params.add("agentAddress", ag.getAddressIfSet())
            .add("gcAgentAddress", m_ourAgentReference.getAgentAddress());
      lc.log(log::INFO, "In GarbageCollector::aquireTargets(): started tracking an untracked agent");
      // Agent is to be tracked, let's track it.
      double timeout=ag.getTimeout();
      // The creation of the watchdog could fail as well (if the agent gets deleted in the meantime).
      try {
        m_watchedAgents[c] =
          new AgentWatchdog(c, m_objectStore);
        m_watchedAgents[c]->setTimeout(timeout);
      } catch (...) {
        if (m_objectStore.exists(c)) throw;
        m_watchedAgents.erase(c);
        continue;
      }
    }
  }
}

void GarbageCollector::checkHeartbeats(log::LogContext & lc) {
  // Check the heartbeats of the watched agents
  // We can still fail on many steps
  for (std::map<std::string, AgentWatchdog * >::iterator wa = m_watchedAgents.begin();
      wa != m_watchedAgents.end();) {
    // Get the heartbeat. Clean dead agents and remove references to them
    try {
      if (!wa->second->checkAlive()) {
        cleanupDeadAgent(wa->first, wa->second->getDeadAgentDetails(), lc);
        delete wa->second;
        m_watchedAgents.erase(wa++);
      } else {
        wa++;
      }
    } catch (cta::exception::Exception & ex) {
      if (wa->second->checkExists()) {
        // We really have a problem: we failed to check on an agent, that is still present.
        throw;
      } else {
        // The agent simply went away at the wrong time. It will be trimmed from the list on the next pass.
        wa++;
      }
    }
  }
}

void GarbageCollector::cleanupDeadAgent(const std::string & address, std::list<log::Param> agentDetails, log::LogContext & lc) {
  // We detected a dead agent. Try and take ownership of it. It could already be owned
  // by another garbage collector.
  // To minimize locking, take a lock on the agent and check its ownership first.
  // We do not need to be defensive about exceptions here, as the calling function
  // will deal with them.
  Agent agent(address, m_objectStore);
  ScopedExclusiveLock agLock;
  try {
    // The agent could be gone while we try to lock it.
    agLock.lock(agent);
  } catch (Backend::NoSuchObject & ex) {
    log::ScopedParamContainer params(lc);
    params.add("agentAddress", agent.getAddressIfSet())
          .add("gcAgentAddress", m_ourAgentReference.getAgentAddress());
    lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): agent already deleted when trying to lock it. Skipping it.");
    return;
  }
  agent.fetch();
  log::ScopedParamContainer params(lc);
  params.add("agentAddress", agent.getAddressIfSet())
        .add("gcAgentAddress", m_ourAgentReference.getAgentAddress());
  if (agent.getOwner() != m_agentRegister.getAddressIfSet()) {
    params.add("agentOwner", agent.getOwner());
    lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): skipping agent which is not owned by agent register anymore.");
    // The agent will be removed from our ownership by the calling function: we're done.
    return;
  }
  // Acquire ownership of the agent.
  m_ourAgentReference.addToOwnership(address, m_objectStore);
  agent.setOwner(m_ourAgentReference.getAgentAddress());
  agent.commit();
  // Update the register
  ScopedExclusiveLock arl(m_agentRegister);
  m_agentRegister.fetch();
  m_agentRegister.trackAgent(address);
  m_agentRegister.commit();
  arl.release();
  {
    log::ScopedParamContainer params2(lc);
    for (auto p: agentDetails) params2.add(p.getName(), p.getValue());
    lc.log(log::INFO, "In GarbageCollector::cleanupDeadAgent(): will cleanup dead agent.");
  }
  // Return all objects owned by the agent to their respective backup owners
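  // The sorter works in stages: fetch and classify everything lock-free first, then
  // requeue archive and retrieve jobs in bulk per queue, and finally fall back to
  // per-object garbage collection for whatever remains.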
  OwnedObjectSorter ownedObjectSorter;
  std::list<std::shared_ptr<GenericObject>> fetchedObjects;
  ownedObjectSorter.fetchOwnedObjects(agent, fetchedObjects, m_objectStore, lc);
  ownedObjectSorter.sortFetchedObjects(agent, fetchedObjects, m_objectStore, m_catalogue, lc);
  ownedObjectSorter.lockFetchAndUpdateArchiveJobs(agent, m_ourAgentReference, m_objectStore, lc);
  ownedObjectSorter.lockFetchAndUpdateRetrieveJobs(agent, m_ourAgentReference, m_objectStore, lc);
  ownedObjectSorter.lockFetchAndUpdateOtherObjects(agent, m_ourAgentReference, m_objectStore, m_catalogue, lc);
}

void GarbageCollector::OwnedObjectSorter::fetchOwnedObjects(Agent& agent, std::list<std::shared_ptr<GenericObject> >& fetchedObjects, Backend & objectStore, log::LogContext & lc) {
  const auto ownedObjectAddresses = agent.getOwnershipList();
  // Parallel-fetch (lock-free) all the objects to assess their status (check
  // ownership and type) and decide to which queue they will go.
  std::list<std::shared_ptr<GenericObject>> ownedObjects;
  std::map<GenericObject *, std::unique_ptr<GenericObject::AsyncLockfreeFetcher>> ownedObjectsFetchers;
  // This will be the list of objects we failed to garbage collect. This means the garbage collection
  // will be partial (looping?).
  std::list<std::string> skippedObjects;
  // This will be the list of objects that are simply gone. We will still need to remove them from
  // the agent's ownership list.
  std::list<std::string> goneObjects;
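  // Two-phase pattern: launch all the fetches first, then collect each result, so the
  // object store round-trips overlap instead of running serially.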
  // 1) Launch the async fetch of all the objects.
  for (auto & obj : ownedObjectAddresses) {
    // Fetch generic objects
    ownedObjects.emplace_back(new GenericObject(obj, objectStore));
    try {
      ownedObjectsFetchers[ownedObjects.back().get()].reset(ownedObjects.back()->asyncLockfreeFetch());
    } catch (cta::exception::Exception & ex) {
      // We failed to launch the fetch. This is unexpected (absence of the object will surface later).
      // We will not be able to garbage collect this object.
      skippedObjects.emplace_back(obj);
      // Clean up the object reference. We will skip this object, which means the garbage collection
      // will be left incomplete.
      ownedObjectsFetchers.erase(ownedObjects.back().get());
      ownedObjects.pop_back();
      // Log the error.
      log::ScopedParamContainer params(lc);
      params.add("objectAddress", obj)
            .add("exceptionMessage", ex.getMessageValue());
      lc.log(log::ERR, "In GarbageCollector::OwnedObjectSorter::fetchOwnedObjects(): failed to asyncLockfreeFetch(): skipping object. Garbage collection will be incomplete.");
    }
  }
  // 2) Find out the result of the fetches.
  bool ownershipUpdated=false;
  for (auto & obj : ownedObjects) {
    log::ScopedParamContainer params2(lc);
    params2.add("objectAddress", obj->getAddressIfSet());
    try {
      ownedObjectsFetchers.at(obj.get())->wait();
    } catch (Backend::NoSuchObject & ex) {
      goneObjects.push_back(obj->getAddressIfSet());
      lc.log(log::INFO, "In GarbageCollector::OwnedObjectSorter::fetchOwnedObjects(): skipping garbage collection of now gone object.");
      ownedObjectsFetchers.erase(obj.get());
      agent.removeFromOwnership(obj->getAddressIfSet());
      ownershipUpdated=true;
      continue;
    } catch (cta::exception::Exception & ex) {
      // Again, we have a problem. We will skip the object and have an incomplete GC.
      skippedObjects.push_back(obj->getAddressIfSet());
      params2.add("exceptionMessage", ex.getMessageValue());
      lc.log(log::ERR, "In GarbageCollector::OwnedObjectSorter::fetchOwnedObjects(): failed to AsyncLockfreeFetch::wait(): skipping object. Garbage collection will be incomplete.");
      ownedObjectsFetchers.erase(obj.get());
      continue;
    }
    // This object passed the cut, we can record it for next round.
    ownedObjectsFetchers.erase(obj.get());
    fetchedObjects.emplace_back(obj);
  }
  // The generic objects we are interested in are now also stored in fetchedObjects.
  ownedObjects.clear();
  if (ownershipUpdated) agent.commit();
}

void GarbageCollector::OwnedObjectSorter::sortFetchedObjects(Agent& agent, std::list<std::shared_ptr<GenericObject> >& fetchedObjects, Backend & objectStore, cta::catalogue::Catalogue & catalogue, log::LogContext & lc) {
  // 3) Now decide the fate of each fetched and owned object.
  bool ownershipUpdated=false;
  using serializers::ArchiveJobStatus;
  std::set<ArchiveJobStatus> inactiveArchiveJobStatuses({ArchiveJobStatus::AJS_Complete, ArchiveJobStatus::AJS_Failed});
  using serializers::RetrieveJobStatus;
  std::set<RetrieveJobStatus> inactiveRetrieveJobStatuses({RetrieveJobStatus::RJS_Complete, RetrieveJobStatus::RJS_Failed});
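  // Jobs already in one of these terminal statuses are spent and must not be requeued.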
  for (auto & obj: fetchedObjects) {
    log::ScopedParamContainer params2(lc);
    params2.add("objectAddress", obj->getAddressIfSet());
    if (obj->getOwner() != agent.getAddressIfSet()) {
      // For all object types except ArchiveRequests, this means we do
      // not need to deal with this object.
      if (obj->type() == serializers::ArchiveRequest_t) {
        ArchiveRequest ar(*obj);
        for (auto & j:ar.dumpJobs()) if (j.owner == agent.getAddressIfSet()) goto doGCObject;
      } else {
        // Log the owner (except for ArchiveRequests, which can have several owners).
        params2.add("actualOwner", obj->getOwner());
      }
      lc.log(log::WARNING, "In GarbageCollector::OwnedObjectSorter::sortFetchedObjects(): skipping object which is not owned by this agent");
      agent.removeFromOwnership(obj->getAddressIfSet());
      ownershipUpdated=true;
      continue;
    }
  doGCObject:
    switch (obj->type()) {
      case serializers::ArchiveRequest_t:
      {
        // We need to find out in which queue or queues the owned job(s) should be requeued.
        // The decision is simple: if a job is owned and active, it needs to be requeued
        // in its destination archive queue.
        // Get hold of an (unlocked) archive request:
        std::shared_ptr<ArchiveRequest> ar(new ArchiveRequest(*obj));
        obj.reset();
        bool jobRequeued=false;
        for (auto &j: ar->dumpJobs()) {
          if ((j.owner == agent.getAddressIfSet() || ar->getOwner() == agent.getAddressIfSet())
              && !inactiveArchiveJobStatuses.count(j.status)) {
            archiveQueuesAndRequests[j.tapePool].emplace_back(ar);
            log::ScopedParamContainer params3(lc);
            params3.add("tapepool", j.tapePool)
                   .add("copynb", j.copyNb)
                   .add("fileId", ar->getArchiveFile().archiveFileID);
            lc.log(log::INFO, "Selected archive request for requeueing to tape pool");
            jobRequeued=true;
          }
        }
        if (!jobRequeued) {
          log::ScopedParamContainer params3(lc);
          params3.add("fileId", ar->getArchiveFile().archiveFileID);
          lc.log(log::INFO, "No active archive job to requeue found. Request will remain as-is.");
        }
        break;
      }
      case serializers::RetrieveRequest_t:
      {
        // Here we need to re-determine the best tape (and queue) for the retrieve request.
        std::shared_ptr<RetrieveRequest> rr(new RetrieveRequest(*obj));
        obj.reset();
        // Get the list of vids for non-failed tape files.
        std::set<std::string> candidateVids;
        for (auto & j: rr->dumpJobs()) {
          if (!inactiveRetrieveJobStatuses.count(j.status)) {
            candidateVids.insert(rr->getArchiveFile().tapeFiles.at(j.copyNb).vid);
          }
        }
        if (candidateVids.empty()) {
          log::ScopedParamContainer params3(lc);
          params3.add("fileId", rr->getArchiveFile().archiveFileID);
          lc.log(log::INFO, "No active retrieve job to requeue found. Marking request for normal GC (and probably deletion).");
          otherObjects.emplace_back(new GenericObject(rr->getAddressIfSet(), objectStore));
          break;
        }
        std::string vid;
        try {
          vid=Helpers::selectBestRetrieveQueue(candidateVids, catalogue, objectStore);
        } catch (Helpers::NoTapeAvailableForRetrieve & ex) {
          log::ScopedParamContainer params3(lc);
          params3.add("fileId", rr->getArchiveFile().archiveFileID);
          lc.log(log::INFO, "No available tape found. Marking request for normal GC (and probably deletion).");
          otherObjects.emplace_back(new GenericObject(rr->getAddressIfSet(), objectStore));
          break;
        }
        retrieveQueuesAndRequests[vid].emplace_back(rr);
        log::ScopedParamContainer params3(lc);
        // Find copyNb for logging
        size_t copyNb = std::numeric_limits<size_t>::max();
        uint64_t fSeq = std::numeric_limits<uint64_t>::max();
        for (auto & tc: rr->getArchiveFile().tapeFiles) { if (tc.second.vid==vid) { copyNb=tc.first; fSeq=tc.second.fSeq; } }
        params3.add("fileId", rr->getArchiveFile().archiveFileID)
               .add("copyNb", copyNb)
               .add("tapeVid", vid)
               .add("fSeq", fSeq);
        lc.log(log::INFO, "Selected vid to be requeued for retrieve request.");
        break;
      }
      default:
        // For other objects, we will not implement any optimization and simply call
        // their individual garbageCollect method.
        otherObjects.emplace_back(obj);
        obj.reset();
        break;
    }
    // We can now get rid of the generic object (data was transferred in a (typed) object in the sorter).
  }
  // We are now done filling the sorter's containers.
  if (ownershipUpdated) agent.commit();
  fetchedObjects.clear();
}

void GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateArchiveJobs(Agent& agent, AgentReference& agentReference, Backend & objectStore, log::LogContext & lc) {
  // We can now start updating the objects efficiently. We still need to re-fetch them locked 
  // and validate ownership.
  // 
  // 1) Get the archive requests done.
  for (auto & tapepool: archiveQueuesAndRequests) {
    // The number of objects to requeue could be very high. In order to limit the time taken by the
    // individual requeue operations, we limit the number of concurrently requeued objects to an 
    // arbitrary 500.
    while (tapepool.second.size()) {
      decltype (tapepool.second) currentJobBatch;
      while (tapepool.second.size() && currentJobBatch.size() <= 500) {
        currentJobBatch.emplace_back(std::move(tapepool.second.front()));
        tapepool.second.pop_front();
      }
      double queueLockFetchTime=0;
      double queueProcessAndCommitTime=0;
      double requestsUpdatePreparationTime=0;
      double requestsUpdatingTime=0;
      double queueRecommitTime=0;
      uint64_t filesQueued=0;
      uint64_t filesDequeued=0;
      uint64_t bytesQueued=0;
      uint64_t bytesDequeued=0;
      uint64_t filesBefore=0;
      uint64_t bytesBefore=0;
      utils::Timer t;
      // Get the archive queue and add references to the jobs in it.
      ArchiveQueue aq(objectStore);
      ScopedExclusiveLock aql;
      Helpers::getLockedAndFetchedQueue<ArchiveQueue>(aq, aql, agentReference, tapepool.first, QueueType::LiveJobs, lc);
      queueLockFetchTime = t.secs(utils::Timer::resetCounter);
      auto jobsSummary=aq.getJobsSummary();
      filesBefore=jobsSummary.jobs;
      bytesBefore=jobsSummary.bytes;
      // We have the queue. We will loop on the requests, add them to the queue. We will launch their updates 
      // after committing the queue.
      std::list<ArchiveQueue::JobToAdd> jtal;
      for (auto & ar: currentJobBatch) {
        // Determine the copy number and feed the queue with it.
        for (auto &j: ar->dumpJobs()) {
          if (j.tapePool == tapepool.first) {
            jtal.push_back({j, ar->getAddressIfSet(), ar->getArchiveFile().archiveFileID, 
                ar->getArchiveFile().fileSize, ar->getMountPolicy(), ar->getEntryLog().time});         
          }
        }
      }
      auto addedJobs = aq.addJobsIfNecessaryAndCommit(jtal, agentReference, lc);
      queueProcessAndCommitTime = t.secs(utils::Timer::resetCounter);
      // Record what was queued for the summary log below; without this, filesQueued and
      // bytesQueued would stay at zero (assumes the addition summary exposes files/bytes).
      filesQueued = addedJobs.files;
      bytesQueued = addedJobs.bytes;
      // If we have an unexpected failure, we will re-run the individual garbage collection. Before that, 
      // we will NOT remove the object from agent's ownership. This variable is declared a bit ahead so
      // the goto will not cross its initialization.
      std::set<std::string> jobsIndividuallyGCed;
      if (!addedJobs.files) {
        goto agentCleanupForArchive;
      }
      // We will keep individual references for each job update we launch so that we make
      // no assumption (several jobs could be queued to the same pool, even if not expected
      // at the high level).
      struct ARUpdatersParams {
        std::unique_ptr<ArchiveRequest::AsyncJobOwnerUpdater> updater;
        std::shared_ptr<ArchiveRequest> archiveRequest;
        uint16_t copyNb;
      };
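      // The braces below scope these helpers so they are destroyed before the
      // agentCleanupForArchive label: the goto above must not jump over live initializations.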
      {
        std::list<ARUpdatersParams> arUpdatersParams;
        for (auto & ar: currentJobBatch) {
          for (auto & j: ar->dumpJobs()) {
            if (j.tapePool == tapepool.first) {
              arUpdatersParams.emplace_back();
              arUpdatersParams.back().archiveRequest = ar;
              arUpdatersParams.back().copyNb = j.copyNb;
              arUpdatersParams.back().updater.reset(
                ar->asyncUpdateJobOwner(j.copyNb, aq.getAddressIfSet(), agent.getAddressIfSet()));
            }
          }
        }
        requestsUpdatePreparationTime = t.secs(utils::Timer::resetCounter);
        // Now collect the results.
        std::list<std::string> requestsToDequeue;
        for (auto & arup: arUpdatersParams) {
          try {
            arup.updater->wait();
            // OK, the job made it to the queue
            log::ScopedParamContainer params(lc);
            params.add("archiveRequestObject", arup.archiveRequest->getAddressIfSet())
                  .add("copyNb", arup.copyNb)
                  .add("fileId", arup.archiveRequest->getArchiveFile().archiveFileID)
                  .add("tapepool", tapepool.first)
                  .add("archiveQueueObject", aq.getAddressIfSet())
                  .add("garbageCollectedPreviousOwner", agent.getAddressIfSet());
            lc.log(log::INFO, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateArchiveJobs(): requeued archive job.");
          } catch (cta::exception::Exception & e) {
            // Update did not go through. It could be benign
            std::string debugType=typeid(e).name();
            if (typeid(e) == typeid(Backend::NoSuchObject)) {
              // The object was not present or not owned during the update, so we skip it.
              // This is nevertheless unexpected (it was there at the previous fetch), so this is an error.
              log::ScopedParamContainer params(lc);
              params.add("archiveRequestObject", arup.archiveRequest->getAddressIfSet())
                    .add("copyNb", arup.copyNb)
                    .add("fileId", arup.archiveRequest->getArchiveFile().archiveFileID)
                    .add("exceptionType", debugType);
              lc.log(log::ERR, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateArchiveJobs(): failed to requeue gone/not owned archive job. Removing from queue.");
            } else {
              // We have an unexpected error. We will handle this with the request-by-request garbage collection.
              log::ScopedParamContainer params(lc);
              params.add("archiveRequestObject", arup.archiveRequest->getAddressIfSet())
                    .add("copyNb", arup.copyNb)
                    .add("fileId", arup.archiveRequest->getArchiveFile().archiveFileID)
                    .add("exceptionType", debugType)
                    .add("exceptionMessage", e.getMessageValue());
              lc.log(log::ERR, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateArchiveJobs(): failed to requeue archive job with unexpected error. Removing from queue and will re-run individual garbage collection.");
              // We will re-run the individual GC for this one.
              jobsIndividuallyGCed.insert(arup.archiveRequest->getAddressIfSet());
              otherObjects.emplace_back(new GenericObject(arup.archiveRequest->getAddressIfSet(), objectStore));
            }
            // In all cases, the object did NOT make it to the queue.
            filesDequeued ++;
            bytesDequeued += arup.archiveRequest->getArchiveFile().fileSize;
            requestsToDequeue.push_back(arup.archiveRequest->getAddressIfSet());
          }
        }
        requestsUpdatingTime = t.secs(utils::Timer::resetCounter);
        if (requestsToDequeue.size()) {
          aq.removeJobsAndCommit(requestsToDequeue);
          log::ScopedParamContainer params(lc);
          params.add("archiveQueueObject", aq.getAddressIfSet());
          lc.log(log::INFO, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateArchiveJobs(): Cleaned up and re-committed archive queue after error handling.");
          queueRecommitTime = t.secs(utils::Timer::resetCounter);
        }
      }
      {
        log::ScopedParamContainer params(lc);
        auto jobsSummary = aq.getJobsSummary();
        params.add("tapepool", tapepool.first)
              .add("archiveQueueObject", aq.getAddressIfSet())
              .add("filesAdded", filesQueued - filesDequeued)
              .add("bytesAdded", bytesQueued - bytesDequeued)
              .add("filesAddedInitially", filesQueued)
              .add("bytesAddedInitially", bytesQueued)
              .add("filesDequeuedAfterErrors", filesDequeued)
              .add("bytesDequeuedAfterErrors", bytesDequeued)
              .add("filesBefore", filesBefore)
              .add("bytesBefore", bytesBefore)
              .add("filesAfter", jobsSummary.jobs)
              .add("bytesAfter", jobsSummary.bytes)
              .add("queueLockFetchTime", queueLockFetchTime)
              .add("queueProcessAndCommitTime", queueProcessAndCommitTime)
              .add("requestsUpdatePreparationTime", requestsUpdatePreparationTime)
              .add("requestsUpdatingTime", requestsUpdatingTime)
              .add("queueRecommitTime", queueRecommitTime);
        lc.log(log::INFO, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateArchiveJobs(): Requeued a batch of archive requests.");
      }
      // We can now forget the pool-level list. But before that, we can remove the objects
      // from agent ownership if this was the last reference to it.
      // The usage of use_count() is safe here because we are in a single threaded environment.
      // In a multi threaded environment, its usage would not be appropriate.
      // See for example http://en.cppreference.com/w/cpp/memory/shared_ptr/use_count
    agentCleanupForArchive:
      bool ownershipUpdated=false;
      for (auto &ar: currentJobBatch) {
        if (ar.use_count() == 1 && !jobsIndividuallyGCed.count(ar->getAddressIfSet())) {
          // This tapepool was the last user of this archive request. We can remove it from ownership.
          agent.removeFromOwnership(ar->getAddressIfSet());
          ownershipUpdated=true;
        }
      }
      if (ownershipUpdated) agent.commit();
      currentJobBatch.clear();
      // Sleep a bit if we have other rounds to go, so as not to hog the queue.
      if (tapepool.second.size()) sleep(5);
    }
  }
}

void GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateRetrieveJobs(Agent& agent, AgentReference& agentReference, Backend & objectStore, log::LogContext & lc) {  
  // 2) Get the retrieve requests done. They are simpler, as retrieve requests are fully owned.
  // They should hence not have changed since we pre-fetched them.
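  // The flow below mirrors lockFetchAndUpdateArchiveJobs() above, keyed by tape VID
  // instead of tape pool.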
  for (auto & tape: retrieveQueuesAndRequests) {
    while (tape.second.size()) {
      decltype (tape.second) currentJobBatch;
      while (tape.second.size() && currentJobBatch.size() <= 500) {
        currentJobBatch.emplace_back(std::move(tape.second.front()));
        tape.second.pop_front();
      }
      double queueLockFetchTime=0;
      double queueProcessAndCommitTime=0;
      double requestsUpdatePreparationTime=0;
      double requestsUpdatingTime=0;
      double queueRecommitTime=0;
      uint64_t filesQueued=0;
      uint64_t filesDequeued=0;
      uint64_t bytesQueued=0;
      uint64_t bytesDequeued=0;
      uint64_t filesBefore=0;
      uint64_t bytesBefore=0;
      utils::Timer t;
      // Get the retrieve queue and add references to the jobs to it.
      RetrieveQueue rq(objectStore);
      ScopedExclusiveLock rql;
      Helpers::getLockedAndFetchedQueue<RetrieveQueue>(rq, rql, agentReference, tape.first, QueueType::LiveJobs, lc);
      queueLockFetchTime = t.secs(utils::Timer::resetCounter);
      auto jobsSummary=rq.getJobsSummary();
      filesBefore=jobsSummary.files;
      bytesBefore=jobsSummary.bytes;
      // Prepare the list of requests to add to the queue (if needed).
      std::list<RetrieveQueue::JobToAdd> jta;
      // We have the queue. We will loop on the requests, add them to the list. We will launch their updates 
      // after committing the queue.
      for (auto & rr: currentJobBatch) {
        // Determine the copy number and feed the queue with it.
        for (auto &tf: rr->getArchiveFile().tapeFiles) {
          if (tf.second.vid == tape.first) {
            jta.push_back({tf.second.copyNb, tf.second.fSeq, rr->getAddressIfSet(), rr->getArchiveFile().fileSize, 
                rr->getRetrieveFileQueueCriteria().mountPolicy, rr->getEntryLog().time});
          }
        }
      }
      auto addedJobs = rq.addJobsIfNecessaryAndCommit(jta, agentReference, lc);
      queueProcessAndCommitTime = t.secs(utils::Timer::resetCounter);
      // Record what was queued for the summary log below; without this, filesQueued and
      // bytesQueued would stay at zero (assumes the addition summary exposes files/bytes).
      filesQueued = addedJobs.files;
      bytesQueued = addedJobs.bytes;
      // If we have an unexpected failure, we will re-run the individual garbage collection. Before that, 
      // we will NOT remove the object from agent's ownership. This variable is declared a bit ahead so
      // the goto will not cross its initialization.
      std::set<std::string> jobsIndividuallyGCed;
      if (!addedJobs.files) {
        goto agentCleanupForRetrieve;
      }
      // We will keep individual references for each job update we launch so that we make
      // our life easier downstream.
      struct RRUpdatedParams {
        std::unique_ptr<RetrieveRequest::AsyncOwnerUpdater> updater;
        std::shared_ptr<RetrieveRequest> retrieveRequest;
        uint16_t copyNb;
      };
      {
        std::list<RRUpdatedParams> rrUpdatersParams;
        for (auto & rr: currentJobBatch) {
          for (auto & tf: rr->getArchiveFile().tapeFiles) {
            if (tf.second.vid == tape.first) {
              rrUpdatersParams.emplace_back();
              rrUpdatersParams.back().retrieveRequest = rr;
              rrUpdatersParams.back().copyNb = tf.second.copyNb;
              rrUpdatersParams.back().updater.reset(rr->asyncUpdateOwner(tf.second.copyNb,
                  rq.getAddressIfSet(), agent.getAddressIfSet()));
            }
          }
        }
        requestsUpdatePreparationTime = t.secs(utils::Timer::resetCounter);
        // Now collect the results.
        std::list<std::string> requestsToDequeue;
        for (auto & rrup: rrUpdatersParams) {
          try {
            rrup.updater->wait();
            // OK, the job made it to the queue
            log::ScopedParamContainer params(lc);
            params.add("retrieveRequestObject", rrup.retrieveRequest->getAddressIfSet())
                  .add("copyNb", rrup.copyNb)
                  .add("fileId", rrup.retrieveRequest->getArchiveFile().archiveFileID)
                  .add("tapeVid", tape.first)
                  .add("retreveQueueObject", rq.getAddressIfSet())
                  .add("garbageCollectedPreviousOwner", agent.getAddressIfSet());
            lc.log(log::INFO, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateRetrieveJobs(): requeued retrieve job.");
          } catch (cta::exception::Exception & e) {
            // Update did not go through. It could be benign.
            std::string debugType=typeid(e).name();
            if (typeid(e) == typeid(Backend::NoSuchObject) ||
                typeid(e) == typeid(Backend::WrongPreviousOwner)) {
              // The object was not present or not owned during the update, so we skip it.
              // This is nevertheless unexpected (it was there at the previous fetch), so this is an error.
              log::ScopedParamContainer params(lc);
              params.add("retrieveRequestObject", rrup.retrieveRequest->getAddressIfSet())
                    .add("copyNb", rrup.copyNb)
                    .add("fileId", rrup.retrieveRequest->getArchiveFile().archiveFileID)
                    .add("exceptionType", debugType);
              lc.log(log::ERR, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateRetrieveJobs(): failed to requeue gone/not owned retrieve job. Removing from queue.");
            } else {
              // We have an unexpected error. Log it, and remove from the queue. Not much we can
              // do at this point.
              log::ScopedParamContainer params(lc);
              params.add("retrieveRequestObject", rrup.retrieveRequest->getAddressIfSet())
                    .add("copyNb", rrup.copyNb)
                    .add("fileId", rrup.retrieveRequest->getArchiveFile().archiveFileID)
                    .add("exceptionType", debugType)
                    .add("exceptionMessage", e.getMessageValue());
              lc.log(log::ERR, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateRetrieveJobs(): failed to requeue retrieve job with unexpected error. Removing from queue and will re-run individual garbage collection.");
              // We will re-run the individual GC for this one.
              jobsIndividuallyGCed.insert(rrup.retrieveRequest->getAddressIfSet());
              otherObjects.emplace_back(new GenericObject(rrup.retrieveRequest->getAddressIfSet(), objectStore));
            }
            // In all cases, the object did NOT make it to the queue.
            filesDequeued ++;
            bytesDequeued += rrup.retrieveRequest->getArchiveFile().fileSize;
            requestsToDequeue.push_back(rrup.retrieveRequest->getAddressIfSet());
          }
        }
        requestsUpdatingTime = t.secs(utils::Timer::resetCounter);
        if (requestsToDequeue.size()) {
          rq.removeJobsAndCommit(requestsToDequeue);
          log::ScopedParamContainer params(lc);
          params.add("retreveQueueObject", rq.getAddressIfSet());
          lc.log(log::INFO, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateRetrieveJobs(): Cleaned up and re-committed retrieve queue after error handling.");
          queueRecommitTime = t.secs(utils::Timer::resetCounter);
        }
      }
      {
        log::ScopedParamContainer params(lc);
        auto jobsSummary = rq.getJobsSummary();
        params.add("tapeVid", tape.first)
              .add("retrieveQueueObject", rq.getAddressIfSet())
              .add("filesAdded", filesQueued - filesDequeued)
              .add("bytesAdded", bytesQueued - bytesDequeued)
              .add("filesAddedInitially", filesQueued)
              .add("bytesAddedInitially", bytesQueued)
              .add("filesDequeuedAfterErrors", filesDequeued)
              .add("bytesDequeuedAfterErrors", bytesDequeued)
              .add("filesBefore", filesBefore)
              .add("bytesBefore", bytesBefore)
              .add("filesAfter", jobsSummary.files)
              .add("bytesAfter", jobsSummary.bytes)
              .add("queueLockFetchTime", queueLockFetchTime)
              .add("queuePreparationTime", queueProcessAndCommitTime)
              .add("requestsUpdatePreparationTime", requestsUpdatePreparationTime)
              .add("requestsUpdatingTime", requestsUpdatingTime)
              .add("queueRecommitTime", queueRecommitTime);
        lc.log(log::INFO, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateRetrieveJobs(): Requeued a batch of retrieve requests.");
      }
      // We can now forget the per-tape list. But before that, we can remove the objects
      // from agent ownership if this was the last reference to it.
      // The usage of use_count() is safe here because we are in a single threaded environment.
      // In a multi threaded environment, its usage would not be appropriate.
      // See for example http://en.cppreference.com/w/cpp/memory/shared_ptr/use_count
    agentCleanupForRetrieve:
      bool ownershipUpdated=false;
      for (auto &rr: currentJobBatch) {
        if (rr.use_count() == 1 && !jobsIndividuallyGCed.count(rr->getAddressIfSet())) {
          // This tape was the last user of this retrieve request. We can remove it from ownership.
          agent.removeFromOwnership(rr->getAddressIfSet());
          ownershipUpdated=true;
        }
      }
      if (ownershipUpdated) agent.commit();
      currentJobBatch.clear();
      // Sleep a bit if we have other rounds to go, so as not to hog the queue.
      if (tape.second.size()) sleep(5);
    }
  }
}

void GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateOtherObjects(Agent& agent, AgentReference& agentReference, Backend & objectStore, cta::catalogue::Catalogue & catalogue, log::LogContext & lc) {  
  // 3) We are done with the objects requiring mutualized queueing, and hence special treatment.
  // The rest will be garbage collected on an object-by-object basis.
  for (auto & go : otherObjects) { 
    // Find the object.
    log::ScopedParamContainer params2(lc);
    params2.add("objectAddress", go->getAddressIfSet());
    // If the object does not exist, we're done.
    if (go->exists()) {
      ScopedExclusiveLock goLock(*go);
      go->fetch();
      // Call GenericObject's garbage collect method, which in turn will
      // delegate to the object type's garbage collector.
      go->garbageCollectDispatcher(goLock, agent.getAddressIfSet(), agentReference, lc, catalogue);
      lc.log(log::INFO, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateOtherObjects(): garbage collected owned object.");
    } else {
      lc.log(log::INFO, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateOtherObjects(): skipping garbage collection of now gone object.");
    }
    // In all cases, relinquish ownership for this object.
    agent.removeFromOwnership(go->getAddressIfSet());
    agent.commit();
  }
  // We have now processed all the owned objects. We can delete the agent's entry.
  agent.removeAndUnregisterSelf(lc);
  lc.log(log::INFO, "In GarbageCollector::OwnedObjectSorter::lockFetchAndUpdateOtherObjects(): agent entry removed.");
  // We can remove the agent from our own ownership.
  agentReference.removeFromOwnership(agent.getAddressIfSet(), objectStore);
}

}}