Commit c2462eec authored by Eric Cano's avatar Eric Cano
Browse files

Added better measurement of the disk performance to locate slow disk servers...

Added better measurement of the disk performance to locate slow disk servers during tests in production.
parent 6b9e9c10
......@@ -63,7 +63,7 @@ void DiskReadTask::execute(log::LogContext& lc, diskFile::DiskFileFactory & file
fileFactory.createReadFile(m_migratedFile->path()));
log::ScopedParamContainer URLcontext(lc);
URLcontext.add("path", m_migratedFile->path())
.add("actualURL", sourceFile->URL());
.add("actualURL", sourceFile->URL());
if(migratingFileSize != sourceFile->size()){
throw castor::exception::Exception("Mismtach between size given by the client "
"and the real one");
......@@ -86,7 +86,7 @@ void DiskReadTask::execute(log::LogContext& lc, diskFile::DiskFileFactory & file
mb->m_fileBlock = blockId++;
migratingFileSize -= mb->m_payload.read(*sourceFile);
m_stats.transferTime+=localTime.secs(castor::utils::Timer::resetCounter);
m_stats.readWriteTime+=localTime.secs(castor::utils::Timer::resetCounter);
m_stats.dataVolume += mb->m_payload.size();
......@@ -106,6 +106,9 @@ void DiskReadTask::execute(log::LogContext& lc, diskFile::DiskFileFactory & file
} //end of while(migratingFileSize>0)
m_stats.filesCount++;
m_stats.totalTime = totalTime.secs();
// We do not have delayed open like in disk writes, so time spent
// transferring equals total time.
m_stats.transferTime = m_stats.totalTime;
logWithStat(LOG_INFO, "File successfully read from disk", lc);
}
catch(const castor::tape::tapeserver::daemon::ErrorFlag&){
......@@ -163,15 +166,21 @@ void DiskReadTask::circulateAllBlocks(size_t fromBlockId, MemBlock * mb){
//------------------------------------------------------------------------------
void DiskReadTask::logWithStat(int level,const std::string& msg,log::LogContext& lc){
log::ScopedParamContainer params(lc);
params.addSnprintfDouble("transferTime", m_stats.transferTime)
params.addSnprintfDouble("readWriteTime", m_stats.readWriteTime)
.addSnprintfDouble("checksumingTime",m_stats.checksumingTime)
.addSnprintfDouble("waitDataTime",m_stats.waitDataTime)
.addSnprintfDouble("waitReportingTime",m_stats.waitReportingTime)
.addSnprintfDouble("checkingErrorTime",m_stats.checkingErrorTime)
.addSnprintfDouble("openingTime",m_stats.openingTime)
.addSnprintfDouble("transferTime", m_stats.transferTime)
.addSnprintfDouble("totalTime", m_stats.totalTime)
.addSnprintfDouble("filePayloadTransferSpeedMBps",
.add("dataVolume", m_stats.dataVolume)
.addSnprintfDouble("globalPayloadTransferSpeedMBps",
m_stats.totalTime?1.0*m_stats.dataVolume/1000/1000/m_stats.totalTime:0)
.addSnprintfDouble("diskPerformanceMBps",
m_stats.transferTime?1.0*m_stats.dataVolume/1000/1000/m_stats.transferTime:0)
.addSnprintfDouble("readWriteToTransferTimeRatio",
m_stats.transferTime?m_stats.readWriteTime/m_stats.transferTime:0.0)
.add("FILEID",m_migratedFile->fileid())
.add("path",m_migratedFile->path());
lc.log(level,msg);
......
......@@ -46,7 +46,7 @@ DiskReadThreadPool::DiskReadThreadPool(int nbThread, uint64_t maxFilesReq,uint64
DiskReadWorkerThread * thr = new DiskReadWorkerThread(*this);
m_threads.push_back(thr);
m_lc.pushOrReplace(log::Param("threadID",i));
m_lc.log(LOG_INFO, "DiskReadWorkerThread created");
m_lc.log(LOG_DEBUG, "DiskReadWorkerThread created");
}
}
......@@ -58,7 +58,7 @@ DiskReadThreadPool::~DiskReadThreadPool() {
delete m_threads.back();
m_threads.pop_back();
}
logWithStat(LOG_INFO, "All the DiskReadWorkerThreads have been destroyed");
m_lc.log(LOG_DEBUG, "Deleted threads in DiskReadThreadPool::~DiskReadThreadPool");
}
//------------------------------------------------------------------------------
......@@ -131,15 +131,20 @@ void DiskReadThreadPool::addThreadStats(const DiskStats& other){
void DiskReadThreadPool::logWithStat(int level, const std::string& message){
m_pooldStat.totalTime = m_totalTime.secs();
log::ScopedParamContainer params(m_lc);
params.addSnprintfDouble("poolTransferTime", m_pooldStat.transferTime)
params.addSnprintfDouble("poolReadWriteTime", m_pooldStat.readWriteTime)
.addSnprintfDouble("poolWaitFreeMemoryTime",m_pooldStat.waitFreeMemoryTime)
.addSnprintfDouble("poolCheckingErrorTime",m_pooldStat.checkingErrorTime)
.addSnprintfDouble("poolOpeningTime",m_pooldStat.openingTime)
.addSnprintfDouble("poolTransferTime", m_pooldStat.transferTime)
.addSnprintfDouble("poolRealTime",m_pooldStat.totalTime)
.add("poolFileCount",m_pooldStat.filesCount)
.add("poolDataVolume", m_pooldStat.dataVolume)
.addSnprintfDouble("AveragePoolPayloadTransferSpeedMBps",
m_pooldStat.totalTime?1.0*m_pooldStat.dataVolume/1000/1000/m_pooldStat.totalTime:0);
.addSnprintfDouble("poolGlobalPayloadTransferSpeedMBps",
m_pooldStat.totalTime?1.0*m_pooldStat.dataVolume/1000/1000/m_pooldStat.totalTime:0)
.addSnprintfDouble("poolAverageDiskPerformanceMBps",
m_pooldStat.transferTime?1.0*m_pooldStat.dataVolume/1000/1000/m_pooldStat.transferTime:0.0)
.addSnprintfDouble("poolReadWriteToTransferTimeRatio",
m_pooldStat.transferTime?m_pooldStat.readWriteTime/m_pooldStat.transferTime:0.0);
m_lc.log(level,message);
}
//------------------------------------------------------------------------------
......@@ -189,15 +194,20 @@ void DiskReadThreadPool::DiskReadWorkerThread::run() {
void DiskReadThreadPool::DiskReadWorkerThread::
logWithStat(int level, const std::string& message){
log::ScopedParamContainer params(m_lc);
params.addSnprintfDouble("threadTransferTime", m_threadStat.transferTime)
params.addSnprintfDouble("threadReadWriteTime", m_threadStat.readWriteTime)
.addSnprintfDouble("threadWaitFreeMemoryTime",m_threadStat.waitFreeMemoryTime)
.addSnprintfDouble("threadCheckingErrorTime",m_threadStat.checkingErrorTime)
.addSnprintfDouble("threadOpeningTime",m_threadStat.openingTime)
.addSnprintfDouble("threadTransferTime",m_threadStat.transferTime)
.addSnprintfDouble("threadTotalTime",m_threadStat.totalTime)
.addSnprintfDouble("threaDataVolumeInMB", 1.0*m_threadStat.dataVolume/1000/1000)
.addSnprintfDouble("threadPayloadTransferSpeedMBps",
m_threadStat.totalTime?1.0*m_threadStat.dataVolume/1000/1000/m_threadStat.totalTime:0)
.addSnprintfDouble("totalTime",m_threadStat.totalTime);
.add("threadDataVolume",m_threadStat.dataVolume)
.add("threadFileCount",m_threadStat.filesCount)
.addSnprintfDouble("threadGlobalPayloadTransferSpeedMBps",
m_threadStat.totalTime?1.0*m_threadStat.dataVolume/1000/1000/m_threadStat.totalTime:0)
.addSnprintfDouble("threadAverageDiskPerformanceMBps",
m_threadStat.transferTime?1.0*m_threadStat.dataVolume/1000/1000/m_threadStat.transferTime:0.0)
.addSnprintfDouble("threadReadWriteToTransferTimeRatio",
m_threadStat.transferTime?m_threadStat.readWriteTime/m_threadStat.transferTime:0.0);
m_lc.log(level,message);
}
}}}}
......
......@@ -41,8 +41,8 @@ namespace daemon {
/** Cumulated time spent computing checksums */
double checksumingTime;
/** Cumulated time spent transfering data with the disk */
double transferTime;
/** Cumulated time spent calling read/write methods on the disk client */
double readWriteTime;
/** Cumulated time spent waiting for data blocks. */
double waitDataTime;
......@@ -59,6 +59,9 @@ namespace daemon {
/** Cumulated time spent reporting */
double checkingErrorTime;
/** Time spent between the opening of the file and the completion of the transfer */
double transferTime;
/** Total real time spent by the thread/pool. */
double totalTime;
......@@ -72,12 +75,13 @@ namespace daemon {
DiskStats(): openingTime(0.0),
closingTime(0.0),
checksumingTime(0.0),
transferTime(0.0),
readWriteTime(0.0),
waitDataTime(0.0),
waitFreeMemoryTime(0.0),
waitInstructionsTime(0.0),
waitReportingTime(0.0),
checkingErrorTime(0.0),
transferTime(0.0),
totalTime(0.0),
dataVolume(0),
filesCount(0) {}
......@@ -88,13 +92,14 @@ namespace daemon {
openingTime += other.openingTime;
closingTime += other.closingTime;
checksumingTime += other.checksumingTime;
transferTime += other.transferTime;
readWriteTime += other.readWriteTime;
waitDataTime += other.waitDataTime;
waitFreeMemoryTime += other.waitFreeMemoryTime;
waitInstructionsTime += other.waitInstructionsTime;
waitReportingTime += other.waitReportingTime;
checkingErrorTime += other.checkingErrorTime;
// total time is not a cumulative member. It is used to represent wallclock time.
transferTime += other.transferTime; // Transfer times can be cumulated and used to compute the average file bandwidth over a disk pool
// total time is not a cumulative member. It is used to represent real time.
filesCount += other.filesCount;
dataVolume += other.dataVolume;
}
......
......@@ -49,6 +49,7 @@ bool DiskWriteTask::execute(RecallReportPacker& reporter,log::LogContext& lc,
using log::Param;
castor::utils::Timer localTime;
castor::utils::Timer totalTime(localTime);
castor::utils::Timer transferTime(localTime);
try{
// Placeholder for the disk file. We will open it only
// after getting a first correct memory block.
......@@ -76,6 +77,7 @@ bool DiskWriteTask::execute(RecallReportPacker& reporter,log::LogContext& lc,
// the disk file for writing...
if (!writeFile.get()) {
lc.log(LOG_INFO, "About to open disk file for writing");
transferTime.reset();
writeFile.reset(fileFactory.createWriteFile(m_recallingFile->path()));
URLcontext.add("actualURL", writeFile->URL());
lc.log(LOG_INFO, "Opened disk file for writing");
......@@ -85,7 +87,7 @@ bool DiskWriteTask::execute(RecallReportPacker& reporter,log::LogContext& lc,
// Write the data.
m_stats.dataVolume+=mb->m_payload.size();
mb->m_payload.write(*writeFile);
m_stats.transferTime+=localTime.secs(castor::utils::Timer::resetCounter);
m_stats.readWriteTime+=localTime.secs(castor::utils::Timer::resetCounter);
checksum = mb->m_payload.adler32(checksum);
m_stats.checksumingTime+=localTime.secs(castor::utils::Timer::resetCounter);
......@@ -104,6 +106,7 @@ bool DiskWriteTask::execute(RecallReportPacker& reporter,log::LogContext& lc,
} //end of while(1)
reporter.reportCompletedJob(*m_recallingFile,checksum,m_stats.dataVolume);
m_stats.waitReportingTime+=localTime.secs(castor::utils::Timer::resetCounter);
m_stats.transferTime = transferTime.secs();
m_stats.totalTime = totalTime.secs();
logWithStat(LOG_INFO, "File successfully transfered to disk",lc);
......@@ -207,16 +210,22 @@ const DiskStats DiskWriteTask::getTaskStats() const{
//------------------------------------------------------------------------------
void DiskWriteTask::logWithStat(int level,const std::string& msg,log::LogContext& lc){
log::ScopedParamContainer params(lc);
params.addSnprintfDouble("transferTime", m_stats.transferTime)
params.addSnprintfDouble("readWriteTime", m_stats.readWriteTime)
.addSnprintfDouble("checksumingTime",m_stats.checksumingTime)
.addSnprintfDouble("waitDataTime",m_stats.waitDataTime)
.addSnprintfDouble("waitReportingTime",m_stats.waitReportingTime)
.addSnprintfDouble("checkingErrorTime",m_stats.checkingErrorTime)
.addSnprintfDouble("openingTime",m_stats.openingTime)
.addSnprintfDouble("closingTime",m_stats.closingTime)
.addSnprintfDouble("transferTime", m_stats.transferTime)
.addSnprintfDouble("totalTime", m_stats.totalTime)
.addSnprintfDouble("filePayloadTransferSpeedMBps",
.add("dataVolume", m_stats.dataVolume)
.addSnprintfDouble("globalPayloadTransferSpeedMBps",
m_stats.totalTime?1.0*m_stats.dataVolume/1000/1000/m_stats.totalTime:0)
.addSnprintfDouble("diskPerformanceMBps",
m_stats.transferTime?1.0*m_stats.dataVolume/1000/1000/m_stats.transferTime:0)
.addSnprintfDouble("readWriteToTransferTimeRatio",
m_stats.transferTime?m_stats.readWriteTime/m_stats.transferTime:0.0)
.add("FILEID",m_recallingFile->fileid())
.add("path",m_recallingFile->path());
lc.log(level,msg);
......
......@@ -123,7 +123,7 @@ void DiskWriteThreadPool::addThreadStats(const DiskStats& other){
void DiskWriteThreadPool::logWithStat(int level, const std::string& message){
m_pooldStat.totalTime = m_totalTime.secs();
log::ScopedParamContainer params(m_lc);
params.addSnprintfDouble("poolTransferTime", m_pooldStat.transferTime)
params.addSnprintfDouble("poolReadWriteTime", m_pooldStat.readWriteTime)
.addSnprintfDouble("poolChecksumingTime",m_pooldStat.checksumingTime)
.addSnprintfDouble("poolWaitDataTime",m_pooldStat.waitDataTime)
.addSnprintfDouble("poolWaitReportingTime",m_pooldStat.waitReportingTime)
......@@ -133,8 +133,12 @@ void DiskWriteThreadPool::logWithStat(int level, const std::string& message){
.addSnprintfDouble("poolRealTime",m_pooldStat.totalTime)
.add("poolFileCount",m_pooldStat.filesCount)
.add("poolDataVolume", m_pooldStat.dataVolume)
.addSnprintfDouble("AveragePoolPayloadTransferSpeedMBps",
m_pooldStat.totalTime?1.0*m_pooldStat.dataVolume/1000/1000/m_pooldStat.totalTime:0);
.addSnprintfDouble("poolGlobalPayloadTransferSpeedMBps",
m_pooldStat.totalTime?1.0*m_pooldStat.dataVolume/1000/1000/m_pooldStat.totalTime:0)
.addSnprintfDouble("poolAverageDiskPerformanceMBps",
m_pooldStat.transferTime?1.0*m_pooldStat.dataVolume/1000/1000/m_pooldStat.transferTime:0.0)
.addSnprintfDouble("poolReadWriteToTransferTimeRatio",
m_pooldStat.transferTime?m_pooldStat.readWriteTime/m_pooldStat.transferTime:0.0);
m_lc.log(level,message);
}
//------------------------------------------------------------------------------
......@@ -192,18 +196,23 @@ void DiskWriteThreadPool::DiskWriteWorkerThread::run() {
void DiskWriteThreadPool::DiskWriteWorkerThread::
logWithStat(int level, const std::string& msg) {
log::ScopedParamContainer params(m_lc);
params.addSnprintfDouble("threadTransferTime", m_threadStat.transferTime)
params.addSnprintfDouble("threadReadWriteTime", m_threadStat.readWriteTime)
.addSnprintfDouble("threadChecksumingTime",m_threadStat.checksumingTime)
.addSnprintfDouble("threadWaitDataTime",m_threadStat.waitDataTime)
.addSnprintfDouble("threadWaitReportingTime",m_threadStat.waitReportingTime)
.addSnprintfDouble("threadCheckingErrorTime",m_threadStat.checkingErrorTime)
.addSnprintfDouble("threadOpeningTime",m_threadStat.openingTime)
.addSnprintfDouble("threadClosingTime", m_threadStat.closingTime)
.addSnprintfDouble("threadTransferTime",m_threadStat.transferTime)
.addSnprintfDouble("threadTotalTime",m_threadStat.totalTime)
.add("threaDataVolumeInMB", 1.0*m_threadStat.dataVolume/1000/1000)
.add("threadDataVolume",m_threadStat.dataVolume)
.add("threadFileCount",m_threadStat.filesCount)
.add("threadPayloadTransferSpeedMBps",
m_threadStat.totalTime?1.0*m_threadStat.dataVolume/1000/1000/m_threadStat.totalTime:0);
.addSnprintfDouble("threadGlobalPayloadTransferSpeedMBps",
m_threadStat.totalTime?1.0*m_threadStat.dataVolume/1000/1000/m_threadStat.totalTime:0)
.addSnprintfDouble("threadAverageDiskPerformanceMBps",
m_threadStat.transferTime?1.0*m_threadStat.dataVolume/1000/1000/m_threadStat.transferTime:0.0)
.addSnprintfDouble("threadReadWriteToTransferTimeRatio",
m_threadStat.transferTime?m_threadStat.readWriteTime/m_threadStat.transferTime:0.0);
m_lc.log(level,msg);
}
}}}}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment