Files
serai/coordinator/src/tests/tributary/sync.rs

162 lines
5.4 KiB
Rust
Raw Normal View History

use core::time::Duration;
use std::{sync::Arc, collections::HashSet};
use rand_core::OsRng;
use ciphersuite::{group::GroupEncoding, Ciphersuite, Ristretto};
Add a cosigning protocol to ensure finalizations are unique (#433) * Add a function to deterministically decide which Serai blocks should be co-signed Has a 5 minute latency between co-signs, also used as the maximal latency before a co-sign is started. * Get all active tributaries we're in at a specific block * Add and route CosignSubstrateBlock, a new provided TX * Split queued cosigns per network * Rename BatchSignId to SubstrateSignId * Add SubstrateSignableId, a meta-type for either Batch or Block, and modularize around it * Handle the CosignSubstrateBlock provided TX * Revert substrate_signer.rs to develop (and patch to still work) Due to SubstrateSigner moving when the prior multisig closes, yet cosigning occurring with the most recent key, a single SubstrateSigner can be reused. We could manage multiple SubstrateSigners, yet considering the much lower specifications for cosigning, I'd rather treat it distinctly. * Route cosigning through the processor * Add note to rename SubstrateSigner post-PR I don't want to do so now in order to preserve the diff's clarity. * Implement cosign evaluation into the coordinator * Get tests to compile * Bug fixes, mark blocks without cosigners available as cosigned * Correct the ID Batch preprocesses are saved under, add log statements * Create a dedicated function to handle cosigns * Correct the flow around Batch verification/queueing Verifying `Batch`s could stall when a `Batch` was signed before its predecessors/before the block it's contained in was cosigned (the latter being inevitable as we can't sign a block containing a signed batch before signing the batch). Now, Batch verification happens on a distinct async task in order to not block the handling of processor messages. This task is the sole caller of verify in order to ensure last_verified_batch isn't unexpectedly mutated. When the processor message handler needs to access it, or needs to queue a Batch, it associates the DB TXN with a lock preventing the other task from doing so. This lock, as currently implemented, is a poor and inefficient design. It should be modified to the pattern used for cosign management. Additionally, a new primitive of a DB-backed channel may be immensely valuable. Fixes a standing potential deadlock and a deadlock introduced with the cosigning protocol. * Working full-stack tests After the last commit, this only required extending a timeout. * Replace "co-sign" with "cosign" to make finding text easier * Update the coordinator tests to support cosigning * Inline prior_batch calculation to prevent panic on rotation Noticed when doing a final review of the branch.
2023-11-15 16:57:21 -05:00
use tokio::{
sync::{mpsc, broadcast},
time::sleep,
};
use serai_db::MemDb;
use tributary::Tributary;
use crate::{
tributary::Transaction,
ActiveTributary, TributaryEvent,
p2p::{heartbeat_tributaries_task, handle_p2p_task},
tests::{
LocalP2p,
tributary::{new_keys, new_spec, new_tributaries},
},
};
#[tokio::test]
async fn sync_test() {
let mut keys = new_keys(&mut OsRng);
let spec = new_spec(&mut OsRng, &keys);
// Ensure this can have a node fail
assert!(spec.n() > spec.t());
let mut tributaries = new_tributaries(&keys, &spec).await;
// Keep a Tributary back, effectively having it offline
let syncer_key = keys.pop().unwrap();
let (syncer_p2p, syncer_tributary) = tributaries.pop().unwrap();
// Have the rest form a P2P net
let mut tributary_senders = vec![];
let mut tributary_arcs = vec![];
2023-06-28 20:02:57 +03:00
let mut p2p_threads = vec![];
for (p2p, tributary) in tributaries.drain(..) {
let tributary = Arc::new(tributary);
tributary_arcs.push(tributary.clone());
let (new_tributary_send, new_tributary_recv) = broadcast::channel(5);
Add a cosigning protocol to ensure finalizations are unique (#433) * Add a function to deterministically decide which Serai blocks should be co-signed Has a 5 minute latency between co-signs, also used as the maximal latency before a co-sign is started. * Get all active tributaries we're in at a specific block * Add and route CosignSubstrateBlock, a new provided TX * Split queued cosigns per network * Rename BatchSignId to SubstrateSignId * Add SubstrateSignableId, a meta-type for either Batch or Block, and modularize around it * Handle the CosignSubstrateBlock provided TX * Revert substrate_signer.rs to develop (and patch to still work) Due to SubstrateSigner moving when the prior multisig closes, yet cosigning occurring with the most recent key, a single SubstrateSigner can be reused. We could manage multiple SubstrateSigners, yet considering the much lower specifications for cosigning, I'd rather treat it distinctly. * Route cosigning through the processor * Add note to rename SubstrateSigner post-PR I don't want to do so now in order to preserve the diff's clarity. * Implement cosign evaluation into the coordinator * Get tests to compile * Bug fixes, mark blocks without cosigners available as cosigned * Correct the ID Batch preprocesses are saved under, add log statements * Create a dedicated function to handle cosigns * Correct the flow around Batch verification/queueing Verifying `Batch`s could stall when a `Batch` was signed before its predecessors/before the block it's contained in was cosigned (the latter being inevitable as we can't sign a block containing a signed batch before signing the batch). Now, Batch verification happens on a distinct async task in order to not block the handling of processor messages. This task is the sole caller of verify in order to ensure last_verified_batch isn't unexpectedly mutated. When the processor message handler needs to access it, or needs to queue a Batch, it associates the DB TXN with a lock preventing the other task from doing so. This lock, as currently implemented, is a poor and inefficient design. It should be modified to the pattern used for cosign management. Additionally, a new primitive of a DB-backed channel may be immensely valuable. Fixes a standing potential deadlock and a deadlock introduced with the cosigning protocol. * Working full-stack tests After the last commit, this only required extending a timeout. * Replace "co-sign" with "cosign" to make finding text easier * Update the coordinator tests to support cosigning * Inline prior_batch calculation to prevent panic on rotation Noticed when doing a final review of the branch.
2023-11-15 16:57:21 -05:00
let (cosign_send, _) = mpsc::unbounded_channel();
let thread = tokio::spawn(handle_p2p_task(p2p, cosign_send, new_tributary_recv));
new_tributary_send
.send(TributaryEvent::NewTributary(ActiveTributary { spec: spec.clone(), tributary }))
.map_err(|_| "failed to send ActiveTributary")
.unwrap();
tributary_senders.push(new_tributary_send);
2023-06-28 20:02:57 +03:00
p2p_threads.push(thread);
}
let tributaries = tributary_arcs;
2023-10-13 21:46:26 -04:00
// After four blocks of time, we should have a new block
// We don't wait one block of time as we may have missed the chance for the first block
// We don't wait two blocks because we may have missed the chance, and then had a failure to
2023-10-13 21:46:26 -04:00
// propose by our 'offline' validator, which would cause the Tendermint round time to increase,
// requiring a longer delay
let block_time = u64::from(Tributary::<MemDb, Transaction, LocalP2p>::block_time());
2023-10-13 21:46:26 -04:00
sleep(Duration::from_secs(4 * block_time)).await;
let tip = tributaries[0].tip().await;
assert!(tip != spec.genesis());
// Sleep one second to make sure this block propagates
sleep(Duration::from_secs(1)).await;
// Make sure every tributary has it
for tributary in &tributaries {
assert!(tributary.reader().block(&tip).is_some());
}
// Now that we've confirmed the other tributaries formed a net without issue, drop the syncer's
// pending P2P messages
syncer_p2p.1.write().await.1.last_mut().unwrap().clear();
// Have it join the net
let syncer_key = Ristretto::generator() * *syncer_key;
let syncer_tributary = Arc::new(syncer_tributary);
let (syncer_tributary_send, syncer_tributary_recv) = broadcast::channel(5);
Add a cosigning protocol to ensure finalizations are unique (#433) * Add a function to deterministically decide which Serai blocks should be co-signed Has a 5 minute latency between co-signs, also used as the maximal latency before a co-sign is started. * Get all active tributaries we're in at a specific block * Add and route CosignSubstrateBlock, a new provided TX * Split queued cosigns per network * Rename BatchSignId to SubstrateSignId * Add SubstrateSignableId, a meta-type for either Batch or Block, and modularize around it * Handle the CosignSubstrateBlock provided TX * Revert substrate_signer.rs to develop (and patch to still work) Due to SubstrateSigner moving when the prior multisig closes, yet cosigning occurring with the most recent key, a single SubstrateSigner can be reused. We could manage multiple SubstrateSigners, yet considering the much lower specifications for cosigning, I'd rather treat it distinctly. * Route cosigning through the processor * Add note to rename SubstrateSigner post-PR I don't want to do so now in order to preserve the diff's clarity. * Implement cosign evaluation into the coordinator * Get tests to compile * Bug fixes, mark blocks without cosigners available as cosigned * Correct the ID Batch preprocesses are saved under, add log statements * Create a dedicated function to handle cosigns * Correct the flow around Batch verification/queueing Verifying `Batch`s could stall when a `Batch` was signed before its predecessors/before the block it's contained in was cosigned (the latter being inevitable as we can't sign a block containing a signed batch before signing the batch). Now, Batch verification happens on a distinct async task in order to not block the handling of processor messages. This task is the sole caller of verify in order to ensure last_verified_batch isn't unexpectedly mutated. When the processor message handler needs to access it, or needs to queue a Batch, it associates the DB TXN with a lock preventing the other task from doing so. This lock, as currently implemented, is a poor and inefficient design. It should be modified to the pattern used for cosign management. Additionally, a new primitive of a DB-backed channel may be immensely valuable. Fixes a standing potential deadlock and a deadlock introduced with the cosigning protocol. * Working full-stack tests After the last commit, this only required extending a timeout. * Replace "co-sign" with "cosign" to make finding text easier * Update the coordinator tests to support cosigning * Inline prior_batch calculation to prevent panic on rotation Noticed when doing a final review of the branch.
2023-11-15 16:57:21 -05:00
let (cosign_send, _) = mpsc::unbounded_channel();
tokio::spawn(handle_p2p_task(syncer_p2p.clone(), cosign_send, syncer_tributary_recv));
syncer_tributary_send
.send(TributaryEvent::NewTributary(ActiveTributary {
spec: spec.clone(),
tributary: syncer_tributary.clone(),
}))
.map_err(|_| "failed to send ActiveTributary to syncer")
.unwrap();
// It shouldn't automatically catch up. If it somehow was, our test would be broken
// Sanity check this
let tip = tributaries[0].tip().await;
// Wait until a new block occurs
sleep(Duration::from_secs(3 * block_time)).await;
// Make sure a new block actually occurred
assert!(tributaries[0].tip().await != tip);
// Make sure the new block alone didn't trigger catching up
assert_eq!(syncer_tributary.tip().await, spec.genesis());
// Start the heartbeat protocol
let (syncer_heartbeat_tributary_send, syncer_heartbeat_tributary_recv) = broadcast::channel(5);
tokio::spawn(heartbeat_tributaries_task(syncer_p2p, syncer_heartbeat_tributary_recv));
syncer_heartbeat_tributary_send
.send(TributaryEvent::NewTributary(ActiveTributary {
spec: spec.clone(),
tributary: syncer_tributary.clone(),
}))
.map_err(|_| "failed to send ActiveTributary to heartbeat")
.unwrap();
// The heartbeat is once every 10 blocks
sleep(Duration::from_secs(10 * block_time)).await;
assert!(syncer_tributary.tip().await != spec.genesis());
// Verify it synced to the tip
let syncer_tip = {
let tributary = &tributaries[0];
let tip = tributary.tip().await;
let syncer_tip = syncer_tributary.tip().await;
// Allow a one block tolerance in case of race conditions
assert!(
HashSet::from([tip, tributary.reader().block(&tip).unwrap().parent()]).contains(&syncer_tip)
);
syncer_tip
};
sleep(Duration::from_secs(block_time)).await;
// Verify it's now keeping up
assert!(syncer_tributary.tip().await != syncer_tip);
// Verify it's now participating in consensus
2023-06-28 20:02:57 +03:00
// Because only `t` validators are used in a commit, take n - t nodes offline
// leaving only `t` nodes. Which should force it to participate in the consensus
// of next blocks.
let spares = usize::from(spec.n() - spec.t());
for thread in p2p_threads.iter().take(spares) {
thread.abort();
}
2023-06-28 20:02:57 +03:00
// wait for a block
sleep(Duration::from_secs(block_time)).await;
if syncer_tributary
.reader()
.parsed_commit(&syncer_tributary.tip().await)
.unwrap()
.validators
.iter()
.any(|signer| signer == &syncer_key.to_bytes())
{
return;
}
panic!("synced tributary didn't start participating in consensus");
}