PR to track down CI failures (#501)

* Use an extended timeout for DKGs specifically

* Add a log statement when message-queue connection fails

* Add a 60 second keep-alive to connections

* Use zalloc for processor/message-queue/coordinator

An additional layer which protects us against edge cases with Zeroizing
(objects which don't support it or don't miss it).

* Add further logs to message-queue

* Further increase re-attempt timeouts in CI

* Remove misplaced continue inmessage-queue client

Fixes observed CI failures.

* Revert "Further increase re-attempt timeouts in CI"

This reverts commit 3723530cf6.
This commit is contained in:
Luke Parker
2024-01-04 01:08:13 -05:00
committed by GitHub
parent 6c8040f723
commit 7eb388e546
10 changed files with 63 additions and 15 deletions

View File

@@ -63,6 +63,10 @@ use cosign_evaluator::CosignEvaluator;
#[cfg(test)]
pub mod tests;
#[global_allocator]
static ALLOCATOR: zalloc::ZeroizingAlloc<std::alloc::System> =
zalloc::ZeroizingAlloc(std::alloc::System);
#[derive(Clone)]
pub struct ActiveTributary<D: Db, P: P2p> {
pub spec: TributarySpec,

View File

@@ -141,12 +141,16 @@ impl ReattemptDb {
// 5 minutes for attempts 0 ..= 2, 10 minutes for attempts 3 ..= 5, 15 minutes for attempts > 5
// Assumes no event will take longer than 15 minutes, yet grows the time in case there are
// network bandwidth issues
let reattempt_delay = BASE_REATTEMPT_DELAY *
let mut reattempt_delay = BASE_REATTEMPT_DELAY *
((AttemptDb::attempt(txn, genesis, topic)
.expect("scheduling re-attempt for unknown topic") /
3) +
1)
.min(3);
// Allow more time for DKGs since they have an extra round and much more data
if matches!(topic, Topic::Dkg) {
reattempt_delay *= 4;
}
let upon_block = current_block_number + reattempt_delay;
let mut reattempts = Self::get(txn, genesis, upon_block).unwrap_or(vec![]);