diff --git a/tapeserver/daemon/DriveHandler.cpp b/tapeserver/daemon/DriveHandler.cpp index 7e6faf60702ff212133be26e4894fb59c21e732b..4462e032ec63b8778a1ddbb25f3f078a53085f4a 100644 --- a/tapeserver/daemon/DriveHandler.cpp +++ b/tapeserver/daemon/DriveHandler.cpp @@ -182,15 +182,32 @@ SubprocessHandler::ProcessingStatus DriveHandler::fork() { decltype (SubprocessHandler::ProcessingStatus::nextTimeout) DriveHandler::nextTimeout() { // If a timeout is defined, then we compute its expiration time. Else we just give default timeout (end of times). decltype (SubprocessHandler::ProcessingStatus::nextTimeout) ret=decltype(SubprocessHandler::ProcessingStatus::nextTimeout)::max(); + bool retSet=false; try { ret=m_lastStateChangeTime+m_stateChangeTimeouts.at(m_sessionState); + retSet=true; + m_timeoutType="StateChange"; } catch (...) {} try { - ret=std::min(ret, m_lastHeartBeatTime+m_heartbeatTimeouts.at(m_sessionState)); + auto newRet=m_lastHeartBeatTime+m_heartbeatTimeouts.at(m_sessionState); + if (newRet < ret) { + ret=newRet; + retSet=true; + m_timeoutType="Heartbeat"; + } } catch (...) {} try { - ret=std::min(ret, m_lastDataMovementTime+m_dataMovementTimeouts.at(m_sessionState)); + auto newRet=m_lastDataMovementTime+m_dataMovementTimeouts.at(m_sessionState); + if (newRet < ret) { + ret=newRet; + retSet=true; + m_timeoutType="DataMovement"; + } } catch (...) {} + if (retSet) { + m_sessionStateWhenTimeoutDecided=m_sessionState; + m_sessionTypeWhenTimeoutDecided=m_sessionType; + } return ret; } @@ -753,9 +770,16 @@ SubprocessHandler::ProcessingStatus DriveHandler::processTimeout() { resetToDefault(PreviousSession::Crashed); return m_processingStatus; } - params.add("SessionState", session::toString(m_sessionState)) - .add("SesssionType", session::toString(m_sessionType)); auto now = std::chrono::steady_clock::now(); + params.add("SessionState", session::toString(m_sessionState)) + .add("SesssionType", session::toString(m_sessionType)) + .add("TimeoutType", m_timeoutType) + .add("SessionTypeWhenTimeoutDecided", session::toString(m_sessionTypeWhenTimeoutDecided)) + .add("SessionStateWhenTimeoutDecided", session::toString(m_sessionStateWhenTimeoutDecided)) + .add("LasteDataMovementTime", std::chrono::duration_cast<std::chrono::seconds>(m_lastDataMovementTime.time_since_epoch()).count()) + .add("LastHeartbeatTime", std::chrono::duration_cast<std::chrono::seconds>(m_lastHeartBeatTime.time_since_epoch()).count()) + .add("LastStateChnageTime", std::chrono::duration_cast<std::chrono::seconds>(m_lastStateChangeTime.time_since_epoch()).count()) + .add("Now", std::chrono::duration_cast<std::chrono::seconds>(now.time_since_epoch()).count()); // Log timeouts (if we have any) try { decltype (SubprocessHandler::ProcessingStatus::nextTimeout) nextTimeout = @@ -778,10 +802,10 @@ SubprocessHandler::ProcessingStatus DriveHandler::processTimeout() { try { params.add("SubprocessId", m_pid); exception::Errnum::throwOnMinusOne(::kill(m_pid, SIGKILL)); - m_processManager.logContext().log(log::INFO, "Killed subprocess."); + m_processManager.logContext().log(log::WARNING, "In DriveHandler::processTimeout(): Killed subprocess."); } catch (exception::Exception & ex) { params.add("Error", ex.getMessageValue()); - m_processManager.logContext().log(log::INFO, "Failed to kill() subprocess."); + m_processManager.logContext().log(log::ERR, "In DriveHandler::processTimeout(): Failed to kill subprocess."); } // We now should receive the sigchild, so we ask nothing from process manager m_processingStatus.nextTimeout=m_processingStatus.nextTimeout.max(); diff --git a/tapeserver/daemon/DriveHandler.hpp b/tapeserver/daemon/DriveHandler.hpp index 3a4ffda58cf096d681903ad9b8397466456eedcc..4d1ee06a3cc1bef3cd7ccc13aa1d232de5e85eb7 100644 --- a/tapeserver/daemon/DriveHandler.hpp +++ b/tapeserver/daemon/DriveHandler.hpp @@ -116,6 +116,12 @@ private: std::chrono::time_point<std::chrono::steady_clock> m_lastHeartBeatTime=decltype(m_lastHeartBeatTime)::min(); /** When did we see the last data movement? */ std::chrono::time_point<std::chrono::steady_clock> m_lastDataMovementTime=decltype(m_lastDataMovementTime)::min(); + /** Type of the currently active timeout */ + std::string m_timeoutType; + /** Session type when currently active timeout was set */ + session::SessionType m_sessionTypeWhenTimeoutDecided; + /** State when current timeout was set */ + session::SessionState m_sessionStateWhenTimeoutDecided; /** Computation of the next timeout (depending on the state) */ decltype (SubprocessHandler::ProcessingStatus::nextTimeout) nextTimeout(); /** How much data did we see moving the session so far? (tape) */ diff --git a/tapeserver/daemon/ProcessManager.cpp b/tapeserver/daemon/ProcessManager.cpp index 719cb69fe5d2e3e9a3f20c725c5a8612f6ae76af..bf2bbe848afe3de2c0e85279fe5d83744d44973d 100644 --- a/tapeserver/daemon/ProcessManager.cpp +++ b/tapeserver/daemon/ProcessManager.cpp @@ -248,8 +248,13 @@ void ProcessManager::runEventLoop() { // If the handler requested kill, shutdown or fork, we can go back to handlers, // which means we exit from the loop here. if (sp.status.forkRequested || sp.status.killRequested || sp.status.shutdownRequested || sp.status.sigChild) return; - // If new timeout is still in the past, we overlook it + // If new timeout is still in the past, we overlook it (but log it) if (sp.status.nextTimeout < std::chrono::steady_clock::now()) { + log::ScopedParamContainer params(m_logContext); + params.add("now", std::chrono::duration_cast<std::chrono::seconds>(std::chrono::steady_clock::now().time_since_epoch()).count()) + .add("subprocess", sp.handler->index) + .add("timeout", std::chrono::duration_cast<std::chrono::seconds>(sp.status.nextTimeout.time_since_epoch()).count()); + m_logContext.log(log::ERR, "In ProcessManager::runEventLoop(): got twice a timeout in the past. Skipping."); continue; } }