PR to track down CI failures (#501)

* Use an extended timeout for DKGs specifically
* Add a log statement when message-queue connection fails
* Add a 60 second keep-alive to connections
* Use zalloc for processor/message-queue/coordinator
  An additional layer which protects us against edge cases with Zeroizing (objects which don't support it or miss it).
* Add further logs to message-queue
* Further increase re-attempt timeouts in CI
* Remove misplaced continue in message-queue client
  Fixes observed CI failures.
* Revert "Further increase re-attempt timeouts in CI"
  This reverts commit 3723530cf6.
2025-12-08 12:19:24 +00:00 · 2024-01-04 01:08:13 -05:00
parent 6c8040f723
commit 7eb388e546
10 changed files with 63 additions and 15 deletions
--- a/coordinator/src/main.rs
+++ b/coordinator/src/main.rs
@@ -63,6 +63,10 @@ use cosign_evaluator::CosignEvaluator;
 #[cfg(test)]
 pub mod tests;

+#[global_allocator]
+static ALLOCATOR: zalloc::ZeroizingAlloc<std::alloc::System> =
+  zalloc::ZeroizingAlloc(std::alloc::System);
+
 #[derive(Clone)]
 pub struct ActiveTributary<D: Db, P: P2p> {
  pub spec: TributarySpec,
--- a/coordinator/src/tributary/db.rs
+++ b/coordinator/src/tributary/db.rs
@@ -141,12 +141,16 @@ impl ReattemptDb {
    // 5 minutes for attempts 0 ..= 2, 10 minutes for attempts 3 ..= 5, 15 minutes for attempts > 5
    // Assumes no event will take longer than 15 minutes, yet grows the time in case there are
    // network bandwidth issues
-    let reattempt_delay = BASE_REATTEMPT_DELAY *
+    let mut reattempt_delay = BASE_REATTEMPT_DELAY *
      ((AttemptDb::attempt(txn, genesis, topic)
        .expect("scheduling re-attempt for unknown topic") /
        3) +
        1)
      .min(3);
+    // Allow more time for DKGs since they have an extra round and much more data
+    if matches!(topic, Topic::Dkg) {
+      reattempt_delay *= 4;
+    }
    let upon_block = current_block_number + reattempt_delay;

    let mut reattempts = Self::get(txn, genesis, upon_block).unwrap_or(vec![]);