mirror of
https://github.com/serai-dex/serai.git
synced 2025-12-08 12:19:24 +00:00
Reattempts (#483)
* Schedule re-attempts and add a (not filled out) match statement to actually execute them A comment explains the methodology. To copy it here: """ This is because we *always* re-attempt any protocol which had participation. That doesn't mean we *should* re-attempt this protocol. The alternatives were: 1) Note on-chain we completed a protocol, halting re-attempts upon 34%. 2) Vote on-chain to re-attempt a protocol. This schema doesn't have any additional messages upon the success case (whereas alternative #1 does) and doesn't have overhead (as alternative #2 does, sending votes and then preprocesses. This only sends preprocesses). """ Any signing protocol which reaches sufficient participation will be re-attempted until it no longer does. * Have the Substrate scanner track DKG removals/completions for the Tributary code * Don't keep trying to publish a participant removal if we've already set keys * Pad out the re-attempt match a bit more * Have CosignEvaluator reload from the DB * Correctly schedule cosign re-attempts * Actually spawn new DKG removal attempts * Use u32 for Batch ID in SubstrateSignableId, finish Batch re-attempt routing The batch ID was an opaque [u8; 5] which also included the network, yet that's redundant and unhelpful. * Clarify a pair of TODOs in the coordinator * Remove old TODO * Final comment cleanup * Correct usage of TARGET_BLOCK_TIME in reattempt scheduler It's in ms and I assumed it was in s. * Have coordinator tests drop BatchReattempts which aren't relevant yet may exist * Bug fix and pointless oddity removal We scheduled a re-attempt upon receiving 2/3rds of preprocesses and upon receiving 2/3rds of shares, so any signing protocol could cause two re-attempts (not one more). The coordinator tests randomly generated the Batch ID since it was previously an opaque byte array. While that didn't break the test, it was pointless and did make the already-succeeded check before re-attempting impossible to hit. 
* Add log statements, correct dead-lock in coordinator tests * Increase pessimistic timeout on recv_message to compensate for tighter best-case timeouts * Further bump timeout by a minute AFAICT, GH failed by just a few seconds. This also is worst-case in a single instance, making it fine to be decently long. * Further further bump timeout due to lack of distinct error
This commit is contained in:
@@ -168,7 +168,7 @@ pub mod coordinator {
|
||||
)]
|
||||
pub enum SubstrateSignableId {
|
||||
CosigningSubstrateBlock([u8; 32]),
|
||||
Batch([u8; 5]),
|
||||
Batch(u32),
|
||||
}
|
||||
|
||||
#[derive(Clone, PartialEq, Eq, Hash, Debug, Encode, Decode, BorshSerialize, BorshDeserialize)]
|
||||
|
||||
@@ -16,7 +16,6 @@ use frost_schnorrkel::Schnorrkel;
|
||||
|
||||
use log::{info, debug, warn};
|
||||
|
||||
use scale::Encode;
|
||||
use serai_client::{
|
||||
primitives::{NetworkId, BlockHash},
|
||||
in_instructions::primitives::{Batch, SignedBatch, batch_message},
|
||||
@@ -26,15 +25,10 @@ use serai_client::{
|
||||
use messages::coordinator::*;
|
||||
use crate::{Get, DbTxn, Db, create_db};
|
||||
|
||||
// Generate an ID unique to a Batch
|
||||
fn batch_sign_id(network: NetworkId, id: u32) -> [u8; 5] {
|
||||
(network, id).encode().try_into().unwrap()
|
||||
}
|
||||
|
||||
create_db!(
|
||||
BatchSignerDb {
|
||||
CompletedDb: (id: [u8; 5]) -> (),
|
||||
AttemptDb: (id: [u8; 5], attempt: u32) -> (),
|
||||
CompletedDb: (id: u32) -> (),
|
||||
AttemptDb: (id: u32, attempt: u32) -> (),
|
||||
BatchDb: (block: BlockHash) -> SignedBatch
|
||||
}
|
||||
);
|
||||
@@ -51,14 +45,12 @@ pub struct BatchSigner<D: Db> {
|
||||
session: Session,
|
||||
keys: Vec<ThresholdKeys<Ristretto>>,
|
||||
|
||||
signable: HashMap<[u8; 5], Batch>,
|
||||
attempt: HashMap<[u8; 5], u32>,
|
||||
signable: HashMap<u32, Batch>,
|
||||
attempt: HashMap<u32, u32>,
|
||||
#[allow(clippy::type_complexity)]
|
||||
preprocessing:
|
||||
HashMap<[u8; 5], (Vec<AlgorithmSignMachine<Ristretto, Schnorrkel>>, Vec<Preprocess>)>,
|
||||
preprocessing: HashMap<u32, (Vec<AlgorithmSignMachine<Ristretto, Schnorrkel>>, Vec<Preprocess>)>,
|
||||
#[allow(clippy::type_complexity)]
|
||||
signing:
|
||||
HashMap<[u8; 5], (AlgorithmSignatureMachine<Ristretto, Schnorrkel>, Vec<SignatureShare>)>,
|
||||
signing: HashMap<u32, (AlgorithmSignatureMachine<Ristretto, Schnorrkel>, Vec<SignatureShare>)>,
|
||||
}
|
||||
|
||||
impl<D: Db> fmt::Debug for BatchSigner<D> {
|
||||
@@ -92,7 +84,7 @@ impl<D: Db> BatchSigner<D> {
|
||||
}
|
||||
}
|
||||
|
||||
fn verify_id(&self, id: &SubstrateSignId) -> Result<(Session, [u8; 5], u32), ()> {
|
||||
fn verify_id(&self, id: &SubstrateSignId) -> Result<(Session, u32, u32), ()> {
|
||||
let SubstrateSignId { session, id, attempt } = id;
|
||||
let SubstrateSignableId::Batch(id) = id else { panic!("BatchSigner handed non-Batch") };
|
||||
|
||||
@@ -104,17 +96,12 @@ impl<D: Db> BatchSigner<D> {
|
||||
// rebooted OR we detected the signed batch on chain
|
||||
// The latter is the expected flow for batches not actively being participated in
|
||||
None => {
|
||||
warn!("not attempting batch {} #{}", hex::encode(id), attempt);
|
||||
warn!("not attempting batch {id} #{attempt}");
|
||||
Err(())?;
|
||||
}
|
||||
Some(our_attempt) => {
|
||||
if attempt != our_attempt {
|
||||
warn!(
|
||||
"sent signing data for batch {} #{} yet we have attempt #{}",
|
||||
hex::encode(id),
|
||||
attempt,
|
||||
attempt
|
||||
);
|
||||
warn!("sent signing data for batch {id} #{attempt} yet we have attempt #{our_attempt}");
|
||||
Err(())?;
|
||||
}
|
||||
}
|
||||
@@ -127,7 +114,7 @@ impl<D: Db> BatchSigner<D> {
|
||||
async fn attempt(
|
||||
&mut self,
|
||||
txn: &mut D::Transaction<'_>,
|
||||
id: [u8; 5],
|
||||
id: u32,
|
||||
attempt: u32,
|
||||
) -> Option<ProcessorMessage> {
|
||||
// See above commentary for why this doesn't emit SignedBatch
|
||||
@@ -138,12 +125,7 @@ impl<D: Db> BatchSigner<D> {
|
||||
// Check if we're already working on this attempt
|
||||
if let Some(curr_attempt) = self.attempt.get(&id) {
|
||||
if curr_attempt >= &attempt {
|
||||
warn!(
|
||||
"told to attempt {} #{} yet we're already working on {}",
|
||||
hex::encode(id),
|
||||
attempt,
|
||||
curr_attempt
|
||||
);
|
||||
warn!("told to attempt {id} #{attempt} yet we're already working on {curr_attempt}");
|
||||
return None;
|
||||
}
|
||||
}
|
||||
@@ -163,7 +145,7 @@ impl<D: Db> BatchSigner<D> {
|
||||
// Update the attempt number
|
||||
self.attempt.insert(id, attempt);
|
||||
|
||||
info!("signing batch {} #{}", hex::encode(id), attempt);
|
||||
info!("signing batch {id} #{attempt}");
|
||||
|
||||
// If we reboot mid-sign, the current design has us abort all signs and wait for latter
|
||||
// attempts/new signing protocols
|
||||
@@ -180,9 +162,7 @@ impl<D: Db> BatchSigner<D> {
|
||||
// TODO: This isn't complete as this txn may not be committed with the expected timing
|
||||
if AttemptDb::get(txn, id, attempt).is_some() {
|
||||
warn!(
|
||||
"already attempted batch {}, attempt #{}. this is an error if we didn't reboot",
|
||||
hex::encode(id),
|
||||
attempt
|
||||
"already attempted batch {id}, attempt #{attempt}. this is an error if we didn't reboot"
|
||||
);
|
||||
return None;
|
||||
}
|
||||
@@ -215,7 +195,7 @@ impl<D: Db> BatchSigner<D> {
|
||||
batch: Batch,
|
||||
) -> Option<ProcessorMessage> {
|
||||
debug_assert_eq!(self.network, batch.network);
|
||||
let id = batch_sign_id(batch.network, batch.id);
|
||||
let id = batch.id;
|
||||
if CompletedDb::get(txn, id).is_some() {
|
||||
debug!("Sign batch order for ID we've already completed signing");
|
||||
// See batch_signed for commentary on why this simply returns
|
||||
@@ -246,10 +226,7 @@ impl<D: Db> BatchSigner<D> {
|
||||
let (machines, our_preprocesses) = match self.preprocessing.remove(&id) {
|
||||
// Either rebooted or RPC error, or some invariant
|
||||
None => {
|
||||
warn!(
|
||||
"not preprocessing for {}. this is an error if we didn't reboot",
|
||||
hex::encode(id),
|
||||
);
|
||||
warn!("not preprocessing for {id}. this is an error if we didn't reboot");
|
||||
return None;
|
||||
}
|
||||
Some(preprocess) => preprocess,
|
||||
@@ -344,10 +321,7 @@ impl<D: Db> BatchSigner<D> {
|
||||
panic!("never preprocessed yet signing?");
|
||||
}
|
||||
|
||||
warn!(
|
||||
"not preprocessing for {}. this is an error if we didn't reboot",
|
||||
hex::encode(id)
|
||||
);
|
||||
warn!("not preprocessing for {id}. this is an error if we didn't reboot");
|
||||
return None;
|
||||
}
|
||||
Some(signing) => signing,
|
||||
@@ -399,7 +373,7 @@ impl<D: Db> BatchSigner<D> {
|
||||
},
|
||||
};
|
||||
|
||||
info!("signed batch {} with attempt #{}", hex::encode(id), attempt);
|
||||
info!("signed batch {id} with attempt #{attempt}");
|
||||
|
||||
let batch =
|
||||
SignedBatch { batch: self.signable.remove(&id).unwrap(), signature: sig.into() };
|
||||
@@ -426,15 +400,13 @@ impl<D: Db> BatchSigner<D> {
|
||||
}
|
||||
|
||||
pub fn batch_signed(&mut self, txn: &mut D::Transaction<'_>, id: u32) {
|
||||
let sign_id = batch_sign_id(self.network, id);
|
||||
|
||||
// Stop trying to sign for this batch
|
||||
CompletedDb::set(txn, sign_id, &());
|
||||
CompletedDb::set(txn, id, &());
|
||||
|
||||
self.signable.remove(&sign_id);
|
||||
self.attempt.remove(&sign_id);
|
||||
self.preprocessing.remove(&sign_id);
|
||||
self.signing.remove(&sign_id);
|
||||
self.signable.remove(&id);
|
||||
self.attempt.remove(&id);
|
||||
self.preprocessing.remove(&id);
|
||||
self.signing.remove(&id);
|
||||
|
||||
// This doesn't emit SignedBatch because it doesn't have access to the SignedBatch
|
||||
// This function is expected to only be called once Substrate acknowledges this block,
|
||||
|
||||
@@ -13,7 +13,6 @@ use sp_application_crypto::{RuntimePublic, sr25519::Public};
|
||||
|
||||
use serai_db::{DbTxn, Db, MemDb};
|
||||
|
||||
use scale::Encode;
|
||||
#[rustfmt::skip]
|
||||
use serai_client::{primitives::*, in_instructions::primitives::*, validator_sets::primitives::Session};
|
||||
|
||||
@@ -49,11 +48,8 @@ async fn test_batch_signer() {
|
||||
],
|
||||
};
|
||||
|
||||
let actual_id = SubstrateSignId {
|
||||
session: Session(0),
|
||||
id: SubstrateSignableId::Batch((batch.network, batch.id).encode().try_into().unwrap()),
|
||||
attempt: 0,
|
||||
};
|
||||
let actual_id =
|
||||
SubstrateSignId { session: Session(0), id: SubstrateSignableId::Batch(batch.id), attempt: 0 };
|
||||
|
||||
let mut signing_set = vec![];
|
||||
while signing_set.len() < usize::from(keys.values().next().unwrap().params().t()) {
|
||||
|
||||
Reference in New Issue
Block a user