From 104242a5e2d444db07b8a8bb40aeb4cde58fff98 Mon Sep 17 00:00:00 2001
From: Martin Hierholzer <martin.hierholzer@desy.de>
Date: Thu, 9 Apr 2020 16:44:44 +0200
Subject: [PATCH] wip #129: rearrange DeviceModule exception handling loop

---
 src/DeviceModule.cc | 177 ++++++++++++++++++--------------------------
 1 file changed, 70 insertions(+), 107 deletions(-)

diff --git a/src/DeviceModule.cc b/src/DeviceModule.cc
index c76830ca..0b0b1baf 100644
--- a/src/DeviceModule.cc
+++ b/src/DeviceModule.cc
@@ -277,57 +277,87 @@ namespace ChimeraTK {
     Application::registerThread("DM_" + getName());
     std::string error;
     owner->testableModeLock("Startup");
+
+    // catch-all required around everything, to prevent any uncaught exception (including the boost thread interruption)
+    // to terminate this thread before waking up other threads waiting on the condition variable.
     try {
-      while(!device.isOpened()) {
-        try {
-          boost::this_thread::interruption_point();
+      while(true) {
+        //  [Spec: 2.3.1] (Re)-open the device.
+        owner->testableModeUnlock("Open/recover device");
+        do {
           usleep(500000);
-          device.open();
-          if(deviceError.status != 0) {
-            deviceError.status = 0;
-            deviceError.message = "";
-            deviceError.setCurrentVersionNumber({});
-            deviceError.writeAll();
+          boost::this_thread::interruption_point();
+          try {
+            device.open();
           }
-        }
-        catch(ChimeraTK::runtime_error& e) {
-          if(deviceError.status != 1) {
-            deviceError.status = 1;
-            deviceError.message = e.what();
-            deviceError.setCurrentVersionNumber({});
-            deviceError.writeAll();
+          catch(ChimeraTK::runtime_error& e) {
+            if(deviceError.status != 1) {
+              // deviceError.status == 0 if the initial open fails (no error yet reported)
+              deviceError.status = 1;
+              deviceError.message = e.what();
+              deviceError.setCurrentVersionNumber({});
+              deviceError.writeAll();
+            }
+            continue; // should not be necessary because isFunctional() should return false. But no harm in leaving it in.
+          }
+        } while(!device.isFunctional());
+        owner->testableModeLock("Initialise device");
+
+        std::unique_lock<std::mutex> errorLock(errorMutex);
+
+        // [Spec: 2.3.2] Run initialisation handlers
+        try {
+          for(auto& initHandler : initialisationHandlers) {
+            initHandler(this);
           }
         }
-      }
+        catch(ChimeraTK::runtime_error&) {
+          // Jump back to re-opening the device
+          continue;
+        }
 
-      // The device was successfully opened, try to initialise it
-      try {
-        for(auto& initHandler : initialisationHandlers) {
-          initHandler(this);
+        // [Spec: 2.3.3] Empty exception reporting queue.
+        while(errorQueue.pop()) {
+          if(owner->isTestableModeEnabled()) {
+            assert(owner->testableMode_counter > 0);
+            --owner->testableMode_counter;
+          }
         }
-        for(auto& te : writeRecoveryOpen) {
-          if(te->getVersionNumber() != VersionNumber{nullptr}) {
-            te->write();
+
+        // [Spec: 2.3.4] Write all recovery accessors
+        try {
+          boost::unique_lock<boost::shared_mutex> uniqueLock(recoverySharedMutex);
+          for(auto& te : writeRecoveryOpen) {
+            if(te->getVersionNumber() != VersionNumber{nullptr}) {
+              te->write();
+            }
           }
         }
-      }
-      catch(ChimeraTK::runtime_error& e) {
-        // we just report the exception and let the exception handling loop do the rest
-        std::lock_guard<std::mutex> locakGuard(errorMutex);
-        deviceHasError = true;
-        if(errorQueue.push(e.what())) {
-          if(owner->isTestableModeEnabled()) ++owner->testableMode_counter;
+        catch(ChimeraTK::runtime_error&) {
+          // Jump back to re-opening the device
+          continue;
         }
-      }
 
-      while(true) {
-        // We need one interruption point at the beginning of the loop. It is always executed and is not blocked by a lock (except the testable mode lock)
-        boost::this_thread::interruption_point();
+        // [Spec: 2.3.5] Reset exception state and wait for the next error to be reported.
+        deviceError.status = 0;
+        deviceError.message = "";
+        deviceError.setCurrentVersionNumber({});
+        deviceError.writeAll();
+
+        deviceHasError = false;
+
+        // [Spec: 2.3.6+2.3.7] send the trigger that the device is available again
+        deviceBecameFunctional.write();
+        errorLock.unlock();
+        errorIsResolvedCondVar.notify_all();
+
+        // [Spec: 2.3.8] Wait for an exception being reported by the ExceptionHandlingDecorators
         // release the testable mode mutex for waiting for the exception.
         owner->testableModeUnlock("Wait for exception");
-        // Do not modify the queue without holding the testable mode lock, because we also consitenly have to modify the counter protected by that mutex.
+        // Do not modify the queue without holding the testable mode lock, because we also consistently have to modify
+        // the counter protected by that mutex.
         // Just check the condition variable.
-        std::unique_lock<std::mutex> errorLock(errorMutex);
+        errorLock.lock();
         while(!deviceHasError) {
           boost::this_thread::interruption_point(); // Make sure not to start waiting for the condition variable if
                                                     // interruption was requested.
@@ -347,82 +377,15 @@ namespace ChimeraTK {
           --owner->testableMode_counter;
         }
 
-        // report exception to the control system
+        // [Spec: 2.6.1] report exception to the control system
         deviceError.status = 1;
         deviceError.message = error;
         deviceError.setCurrentVersionNumber({});
         deviceError.writeAll();
 
-        // wait some time until retrying.
-        // Never sleep when holding a lock
-        errorLock.unlock(); // we must not hold the error lock when not having the testable mode mutex
-        owner->testableModeUnlock("Wait for recovery");
+        // TODO Implementation for Spec 2.6.2 goes here.
 
-        do {
-          usleep(500000);
-          boost::this_thread::interruption_point();
-          try {
-            // This triggers a recovery (device is already opened).
-            device.open();
-          }
-          catch(ChimeraTK::runtime_error&) {
-            // Recovery failed. Just catch the exception and retry.
-            continue; // should not be necessary because isFunctional() should return false. But no harm in leaving it in.
-          }
-        } while(!device.isFunctional());
-
-        owner->testableModeLock("Try recovery");
-        errorLock.lock();
-
-        // FIXME: we have to wait here until the device reports that it is functional again.
-        // Otherwise we spam the error reporting with 2 Hz.
-
-        try {
-          // re-initialise the device before continuing
-          for(auto& initHandler : initialisationHandlers) {
-            initHandler(this);
-          }
-          { // scope for the lock guard
-            boost::unique_lock<boost::shared_mutex> uniqueLock(recoverySharedMutex);
-            for(auto& te : writeRecoveryOpen) {
-              if(te->getVersionNumber() != VersionNumber{nullptr}) {
-                te->write();
-              }
-            }
-          } // end of scope for the lock guard
-        }
-        catch(ChimeraTK::runtime_error& e) {
-          // Report the error. This puts the exception to the queue and we can continue with waiting for the queue.
-          // It is sure we will enter it again because we just pushed to it, so if  the device recovers we will notice.
-          // Do not use reportError, which would just do the mutext business in addition to pushing to the queue, but we already have the mutex.
-          if(errorQueue.push(e.what())) {
-            if(owner->isTestableModeEnabled()) ++owner->testableMode_counter;
-          }
-          continue;
-        }
-
-        // We survived the initialisation (if any). It seems the device is working again.
-        // Empty exception reporting queue.
-        while(errorQueue.pop()) {
-          if(owner->isTestableModeEnabled()) {
-            assert(owner->testableMode_counter > 0);
-            --owner->testableMode_counter;
-          }
-        }
-        // Reset exception state and wait for the next error to be reported.
-        deviceError.status = 0;
-        deviceError.message = "";
-        deviceError.setCurrentVersionNumber({});
-        deviceError.writeAll();
-
-        deviceHasError = false;
-
-        // send the trigger that the device is available again
-        deviceBecameFunctional.write();
-
-        errorLock.unlock();
-        errorIsResolvedCondVar.notify_all();
-      }
+      } // while(true)
     }
     catch(...) {
       // before we leave this thread, we might need to notify other waiting
-- 
GitLab