use core::time::Duration;

use std::{sync::Arc, collections::HashSet};

use rand_core::OsRng;

use ciphersuite::{group::GroupEncoding, Ciphersuite, Ristretto};

use tokio::{
  sync::{mpsc, broadcast},
  time::sleep,
};

use serai_db::MemDb;

use tributary::Tributary;

use crate::{
  tributary::Transaction,
  ActiveTributary, TributaryEvent,
  p2p::{heartbeat_tributaries_task, handle_p2p_task},
  tests::{
    LocalP2p,
    tributary::{new_keys, new_spec, new_tributaries},
  },
};
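// Hold one validator's Tributary back while the rest form a P2P net and produce blocks, then
// connect it and verify it syncs to the tip via the heartbeat protocol and goes on to
// participate in consensus.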
#[tokio::test]
async fn sync_test() {
  let mut keys = new_keys(&mut OsRng);
  let spec = new_spec(&mut OsRng, &keys);
  // Ensure this can have a node fail
  assert!(spec.n() > spec.t());

  let mut tributaries = new_tributaries(&keys, &spec).await;

  // Keep a Tributary back, effectively having it offline
  let syncer_key = keys.pop().unwrap();
  let (syncer_p2p, syncer_tributary) = tributaries.pop().unwrap();

  // Have the rest form a P2P net
  let mut tributary_senders = vec![];
  let mut tributary_arcs = vec![];
  let mut p2p_threads = vec![];
  for (p2p, tributary) in tributaries.drain(..) {
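    // Each remaining validator gets its own P2P handler task, fed by a dedicated TributaryEvent
    // channel; the cosign receiver is dropped, so cosign messages are simply discarded here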
    let tributary = Arc::new(tributary);
    tributary_arcs.push(tributary.clone());
    let (new_tributary_send, new_tributary_recv) = broadcast::channel(5);
    let (cosign_send, _) = mpsc::unbounded_channel();
    let thread = tokio::spawn(handle_p2p_task(p2p, cosign_send, new_tributary_recv));
    new_tributary_send
      .send(TributaryEvent::NewTributary(ActiveTributary { spec: spec.clone(), tributary }))
      .map_err(|_| "failed to send ActiveTributary")
      .unwrap();
    tributary_senders.push(new_tributary_send);
    p2p_threads.push(thread);
  }
  let tributaries = tributary_arcs;

  // After four blocks of time, we should have a new block
  // We don't wait one block of time as we may have missed the chance for the first block
  // We don't wait two blocks because we may have missed the chance, and then had a failure to
  // propose by our 'offline' validator, which would cause the Tendermint round time to increase,
  // requiring a longer delay
  let block_time = u64::from(Tributary::<MemDb, Transaction, LocalP2p>::block_time());
  sleep(Duration::from_secs(4 * block_time)).await;
  let tip = tributaries[0].tip().await;
  assert!(tip != spec.genesis());

  // Sleep one second to make sure this block propagates
  sleep(Duration::from_secs(1)).await;
  // Make sure every tributary has it
  for tributary in &tributaries {
    assert!(tributary.reader().block(&tip).is_some());
  }

  // Now that we've confirmed the other tributaries formed a net without issue, drop the syncer's
  // pending P2P messages
  syncer_p2p.1.write().await.1.last_mut().unwrap().clear();

  // Have it join the net
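  // Derive the syncer's public key so we can later check whether it appears among a commit's
  // signers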
  let syncer_key = Ristretto::generator() * *syncer_key;
  let syncer_tributary = Arc::new(syncer_tributary);
  let (syncer_tributary_send, syncer_tributary_recv) = broadcast::channel(5);
  let (cosign_send, _) = mpsc::unbounded_channel();
  tokio::spawn(handle_p2p_task(syncer_p2p.clone(), cosign_send, syncer_tributary_recv));
  syncer_tributary_send
    .send(TributaryEvent::NewTributary(ActiveTributary {
      spec: spec.clone(),
      tributary: syncer_tributary.clone(),
    }))
    .map_err(|_| "failed to send ActiveTributary to syncer")
    .unwrap();

  // It shouldn't automatically catch up. If it somehow was, our test would be broken
  // Sanity check this
  let tip = tributaries[0].tip().await;
  // Wait until a new block occurs
  sleep(Duration::from_secs(3 * block_time)).await;
  // Make sure a new block actually occurred
  assert!(tributaries[0].tip().await != tip);
  // Make sure the new block alone didn't trigger catching up
  assert_eq!(syncer_tributary.tip().await, spec.genesis());

  // Start the heartbeat protocol
  let (syncer_heartbeat_tributary_send, syncer_heartbeat_tributary_recv) = broadcast::channel(5);
  tokio::spawn(heartbeat_tributaries_task(syncer_p2p, syncer_heartbeat_tributary_recv));
  syncer_heartbeat_tributary_send
    .send(TributaryEvent::NewTributary(ActiveTributary {
      spec: spec.clone(),
      tributary: syncer_tributary.clone(),
    }))
    .map_err(|_| "failed to send ActiveTributary to heartbeat")
    .unwrap();

  // The heartbeat is once every 10 blocks
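  // so sleeping for 10 block times should give the lagging syncer at least one heartbeat response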
  sleep(Duration::from_secs(10 * block_time)).await;
  assert!(syncer_tributary.tip().await != spec.genesis());

  // Verify it synced to the tip
  let syncer_tip = {
    let tributary = &tributaries[0];

    let tip = tributary.tip().await;
    let syncer_tip = syncer_tributary.tip().await;
    // Allow a one block tolerance in case of race conditions
    assert!(
      HashSet::from([tip, tributary.reader().block(&tip).unwrap().parent()]).contains(&syncer_tip)
    );
    syncer_tip
  };

  sleep(Duration::from_secs(block_time)).await;

  // Verify it's now keeping up
  assert!(syncer_tributary.tip().await != syncer_tip);

  // Verify it's now participating in consensus
  // Because only `t` validators are used in a commit, take `n - t` nodes offline, leaving only
  // `t` nodes. This should force the syncer to participate in the consensus of the next blocks.
  let spares = usize::from(spec.n() - spec.t());
  for thread in p2p_threads.iter().take(spares) {
    thread.abort();
  }

  // Wait for a block
  sleep(Duration::from_secs(block_time)).await;

  if syncer_tributary
    .reader()
    .parsed_commit(&syncer_tributary.tip().await)
    .unwrap()
    .validators
    .iter()
    .any(|signer| signer == &syncer_key.to_bytes())
  {
    return;
  }

  panic!("synced tributary didn't start participating in consensus");
}