diff --git a/ReleaseNotes.md b/ReleaseNotes.md index acb83d252ad72d23cdee018cf204dbf00293f76f..664a56a436e66719becec98521d91056205bd364 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -1,3 +1,15 @@ +# v4.NEXT + +## Summary + +## Upgrade Instructions + +## Features + +## Bug fixes +- cta/CTA#1101 - Fix disk space reservation logic adding all existing disk space reservations for all disk systems +- cta/CTA#1023 - Retrieve puts the queue to sleep if the eos disk instance is not reachable + # v4.5.0-1 ## Summary @@ -15,7 +27,6 @@ ## Bug fixes - cta/CTA#1092 - Fix overflow error with drive state latestBandwith causing cta frontend crash - cta/CTA#501 - Fix disappearing reason when TapeDrive is reading or writing -- cta/CTA#1101 - Fix disk space reservation logic adding all existing disk space reservations for all disk systems # v4.4.1-1 diff --git a/scheduler/OStoreDB/OStoreDB.cpp b/scheduler/OStoreDB/OStoreDB.cpp index 0d830466abe75f2283a6b2176f03a68882535e33..38e98c9938e0fa8a27b64025d731ca9d458cfd61 100644 --- a/scheduler/OStoreDB/OStoreDB.cpp +++ b/scheduler/OStoreDB/OStoreDB.cpp @@ -3448,12 +3448,14 @@ bool OStoreDB::RetrieveMount::reserveDiskSpace(const cta::DiskSpaceReservationRe diskSystemFreeSpace.fetchDiskSystemFreeSpace(diskSystemNames, logContext); } catch(const cta::disk::DiskSystemFreeSpaceListException &ex){ // Could not get free space for one of the disk systems. Currently the retrieve mount will only query - // one disk system, so just log the failure. Cannot put the queue to sleep as that implies knowing the disk system sleep time. + // one disk system, so just log the failure and put the queue to sleep inside the loop. 
for (const auto &failedDiskSystem: ex.m_failedDiskSystems) { cta::log::ScopedParamContainer params(logContext); params.add("diskSystemName", failedDiskSystem.first); params.add("failureReason", failedDiskSystem.second.getMessageValue()); - logContext.log(cta::log::ERR, "In OStoreDB::RetrieveMount::reserveDiskSpace(): unable to request EOS free space for disk system"); + logContext.log(cta::log::ERR, "In OStoreDB::RetrieveMount::reserveDiskSpace(): unable to request EOS free space for disk system, putting queue to sleep"); + auto sleepTime = diskSystemFreeSpace.getDiskSystemList().at(failedDiskSystem.first).sleepTime; + putQueueToSleep(failedDiskSystem.first, sleepTime, logContext); } return false; } catch (std::exception &ex) { diff --git a/scheduler/SchedulerDatabaseTest.cpp b/scheduler/SchedulerDatabaseTest.cpp index 00b43cfbbbe72b12a69247b49dbebc002e98baa2..41a42993218511920b96467ca658931a4b14af3b 100644 --- a/scheduler/SchedulerDatabaseTest.cpp +++ b/scheduler/SchedulerDatabaseTest.cpp @@ -614,7 +614,7 @@ TEST_P(SchedulerDatabaseTest, popRetrieveRequestsWithBackpressure) { reservationRequest.addRequest(rj->diskSystemName.value(), rj->archiveFile.fileSize); } } - //reserving disk space second time will fail (not enough disk space, triggers backpressure) + //reserving disk space will fail (not enough disk space, backpressure is triggered) ASSERT_FALSE(rm->reserveDiskSpace(reservationRequest, "", lc)); } rm->complete(time(nullptr)); @@ -692,20 +692,30 @@ TEST_P(SchedulerDatabaseTest, popRetrieveRequestsWithDiskSystemNotFetcheable) { auto mountInfo = db.getMountInfo(lc); ASSERT_EQ(1, mountInfo->potentialMounts.size()); auto rm=mountInfo->createRetrieveMount("vid", "tapePool", "drive", "library", "host", "vo","mediaType", "vendor",123456789,time(nullptr), cta::nullopt); - auto rjb = rm->getNextJobBatch(20,20*1000,lc); - //Files with successful fetch should be popped - ASSERT_EQ(filesToDo, rjb.size()); + { + //leave one job in the queue for the potential mount + auto 
rjb = rm->getNextJobBatch(9,20*1000,lc); + //Files with successful fetch should be popped + ASSERT_EQ(9, rjb.size()); - cta::DiskSpaceReservationRequest reservationRequest; + cta::DiskSpaceReservationRequest reservationRequest; for (auto &rj: rjb) { ASSERT_TRUE((bool)rj->diskSystemName); ASSERT_EQ("ds-Error", rj->diskSystemName.value()); - if (rj->diskSystemName) { - reservationRequest.addRequest(rj->diskSystemName.value(), rj->archiveFile.fileSize); - } + reservationRequest.addRequest(rj->diskSystemName.value(), rj->archiveFile.fileSize); } - //reserving disk space second time will fail (disk instance not reachable) + //reserving disk space will fail because the disk instance is not reachable, causing backpressure ASSERT_FALSE(rm->reserveDiskSpace(reservationRequest, "", lc)); + } + rm->complete(time(nullptr)); + rm.reset(nullptr); + mountInfo.reset(nullptr); + auto mi = db.getMountInfoNoLock(cta::SchedulerDatabase::PurposeGetMountInfo::GET_NEXT_MOUNT,lc); + ASSERT_EQ(1, mi->potentialMounts.size()); + //did not requeue the job batch (the retrieve mount normally does this, but cannot do it in the tests due to BackendVFS) + ASSERT_EQ(1, mi->potentialMounts.begin()->filesQueued); + ASSERT_TRUE(mi->potentialMounts.begin()->sleepingMount); + ASSERT_EQ("ds-Error", mi->potentialMounts.begin()->diskSystemSleptFor); } #undef TEST_MOCK_DB