serai-processor-bin

Moves the coordinator loop out of serai-bitcoin-processor, completing it.

Fixes a potential race condition in the message-queue regarding multiple
sockets sending messages at once.
Author: Luke Parker
Date: 2024-09-11 18:56:23 -04:00
Parent: fcd5fb85df
Commit: b6811f9015
22 changed files with 705 additions and 594 deletions
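The race-condition fix itself lives in the message-queue crate and isn't among the hunks shown below, which cover the processor side. As an illustration only (a hypothetical sketch, not the actual serai message-queue code), the usual way to keep concurrent senders from interleaving bytes on a shared socket is to serialize whole frames behind an async mutex:

use tokio::{io::AsyncWriteExt, net::TcpStream, sync::Mutex};

// Hypothetical sketch: write a length-prefixed frame while holding the lock so
// no other task sharing the socket can interleave its own bytes mid-frame.
async fn send_frame(socket: &Mutex<TcpStream>, frame: &[u8]) -> std::io::Result<()> {
  let mut socket = socket.lock().await;
  socket.write_all(&u32::try_from(frame.len()).unwrap().to_le_bytes()).await?;
  socket.write_all(frame).await?;
  Ok(())
}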


@@ -1,43 +0,0 @@
use messages::{ProcessorMessage, CoordinatorMessage};

use message_queue::{Service, Metadata, client::MessageQueue};

#[derive(Clone, PartialEq, Eq, Debug)]
pub struct Message {
  pub id: u64,
  pub msg: CoordinatorMessage,
}

#[async_trait::async_trait]
pub trait Coordinator {
  async fn send(&mut self, msg: impl Send + Into<ProcessorMessage>);
  async fn recv(&mut self) -> Message;
  async fn ack(&mut self, msg: Message);
}

#[async_trait::async_trait]
impl Coordinator for MessageQueue {
  async fn send(&mut self, msg: impl Send + Into<ProcessorMessage>) {
    let msg: ProcessorMessage = msg.into();
    let metadata = Metadata { from: self.service, to: Service::Coordinator, intent: msg.intent() };
    let msg = borsh::to_vec(&msg).unwrap();
    self.queue(metadata, msg).await;
  }

  async fn recv(&mut self) -> Message {
    let msg = self.next(Service::Coordinator).await;

    let id = msg.id;
    // Deserialize it into a CoordinatorMessage
    let msg: CoordinatorMessage =
      borsh::from_slice(&msg.msg).expect("message wasn't a borsh-encoded CoordinatorMessage");

    return Message { id, msg };
  }

  async fn ack(&mut self, msg: Message) {
    MessageQueue::ack(self, Service::Coordinator, msg.id).await
  }
}
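For orientation, a minimal sketch of how a caller might drive this trait (the real loop is the processor's `run` function further down): receive a message, handle it, and only ack once its effects have been committed, so a crash can't silently drop a half-processed message.

async fn event_loop<Co: Coordinator>(mut coordinator: Co) {
  loop {
    let msg = coordinator.recv().await;
    // Handle msg.msg and commit any resulting state to the DB here, before
    // acking, so the message is redelivered if the process dies mid-handling.
    coordinator.ack(msg).await;
  }
}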


@@ -1,43 +0,0 @@
use std::io::Read;

use scale::{Encode, Decode};

use serai_client::validator_sets::primitives::{Session, KeyPair};

pub use serai_db::*;

use crate::networks::{Block, Network};

create_db!(
  MainDb {
    HandledMessageDb: (id: u64) -> (),
    PendingActivationsDb: () -> Vec<u8>
  }
);

impl PendingActivationsDb {
  pub fn pending_activation<N: Network>(
    getter: &impl Get,
  ) -> Option<(<N::Block as Block<N>>::Id, Session, KeyPair)> {
    if let Some(bytes) = Self::get(getter) {
      if !bytes.is_empty() {
        let mut slice = bytes.as_slice();
        let (session, key_pair) = <(Session, KeyPair)>::decode(&mut slice).unwrap();
        let mut block_before_queue_block = <N::Block as Block<N>>::Id::default();
        slice.read_exact(block_before_queue_block.as_mut()).unwrap();
        assert!(slice.is_empty());
        return Some((block_before_queue_block, session, key_pair));
      }
    }
    None
  }

  pub fn set_pending_activation<N: Network>(
    txn: &mut impl DbTxn,
    block_before_queue_block: &<N::Block as Block<N>>::Id,
    session: Session,
    key_pair: KeyPair,
  ) {
    let mut buf = (session, key_pair).encode();
    buf.extend(block_before_queue_block.as_ref());
    Self::set(txn, &buf);
  }
}


@@ -60,263 +60,9 @@ async fn handle_coordinator_msg<D: Db, N: Network, Co: Coordinator>(
  }
}

async fn boot<N: Network, D: Db, Co: Coordinator>(
  raw_db: &mut D,
  network: &N,
  coordinator: &mut Co,
) -> (D, TributaryMutable<N, D>, SubstrateMutable<N, D>) {
  fn read_key_from_env<C: Ciphersuite>(label: &'static str) -> Zeroizing<C::F> {
    let key_hex =
      Zeroizing::new(env::var(label).unwrap_or_else(|| panic!("{label} wasn't provided")));
    let bytes = Zeroizing::new(
      hex::decode(key_hex).unwrap_or_else(|_| panic!("{label} wasn't a valid hex string")),
    );

    let mut repr = <C::F as PrimeField>::Repr::default();
    if repr.as_ref().len() != bytes.len() {
      panic!("{label} wasn't the correct length");
    }
    repr.as_mut().copy_from_slice(bytes.as_slice());
    let res = Zeroizing::new(
      Option::from(<C::F as PrimeField>::from_repr(repr))
        .unwrap_or_else(|| panic!("{label} wasn't a valid scalar")),
    );
    repr.as_mut().zeroize();
    res
  }

  let key_gen = KeyGen::<N, _>::new(
    raw_db.clone(),
    read_key_from_env::<<Ristretto as EvrfCurve>::EmbeddedCurve>("SUBSTRATE_EVRF_KEY"),
    read_key_from_env::<<N::Curve as EvrfCurve>::EmbeddedCurve>("NETWORK_EVRF_KEY"),
  );

  let (multisig_manager, current_keys, actively_signing) =
    MultisigManager::new(raw_db, network).await;

  let mut batch_signer = None;
  let mut signers = HashMap::new();

  for (i, key) in current_keys.iter().enumerate() {
    let Some((session, (substrate_keys, network_keys))) = key_gen.keys(key) else { continue };
    let network_key = network_keys[0].group_key();

    // If this is the oldest key, load the BatchSigner for it as the active BatchSigner
    // The new key only takes responsibility once the old key is fully deprecated
    //
    // We don't have to load any state for this since the Scanner will re-fire any events
    // necessary, only no longer scanning old blocks once Substrate acks them
    if i == 0 {
      batch_signer = Some(BatchSigner::new(N::NETWORK, session, substrate_keys));
    }

    // The Scanner re-fires events as needed for batch_signer yet not signer
    // This is due to the transactions which we start signing from due to a block not being
    // guaranteed to be signed before we stop scanning the block on reboot
    // We could simplify the Signer flow by delaying when it acks a block, yet that'd:
    // 1) Increase the startup time
    // 2) Cause re-emission of Batch events, which we'd need to check the safety of
    //    (TODO: Do anyways?)
    // 3) Violate the attempt counter (TODO: Is this already being violated?)
    let mut signer = Signer::new(network.clone(), session, network_keys);

    // Sign any TXs being actively signed
    for (plan, tx, eventuality) in &actively_signing {
      if plan.key == network_key {
        let mut txn = raw_db.txn();
        if let Some(msg) =
          signer.sign_transaction(&mut txn, plan.id(), tx.clone(), eventuality).await
        {
          coordinator.send(msg).await;
        }
        // This should only have re-writes of existing data
        drop(txn);
      }
    }

    signers.insert(session, signer);
  }

  // Spawn a task to rebroadcast signed TXs yet to be mined into a finalized block
  // This hedges against being dropped due to full mempools, temporarily too low of a fee...
  tokio::spawn(Signer::<N, D>::rebroadcast_task(raw_db.clone(), network.clone()));

  (
    raw_db.clone(),
    TributaryMutable { key_gen, batch_signer, cosigner: None, slash_report_signer: None, signers },
    multisig_manager,
  )
}

#[allow(clippy::await_holding_lock)] // Needed for txn, unfortunately can't be down-scoped
async fn run<N: Network, D: Db, Co: Coordinator>(mut raw_db: D, network: N, mut coordinator: Co) {
  // We currently expect a contextless bidirectional mapping between these two values
  // (which is that any value of A can be interpreted as B and vice versa)
  // While we can write a contextual mapping, we have yet to do so
  // This check ensures no network which doesn't have a bidirectional mapping is defined
  assert_eq!(<N::Block as Block<N>>::Id::default().as_ref().len(), BlockHash([0u8; 32]).0.len());

  let (main_db, mut tributary_mutable, mut substrate_mutable) =
    boot(&mut raw_db, &network, &mut coordinator).await;

  // We can't load this from the DB as we can't guarantee atomic increments with the ack function
  // TODO: Load with a slight tolerance
  let mut last_coordinator_msg = None;

  loop {
    let mut txn = raw_db.txn();

    log::trace!("new db txn in run");

    let mut outer_msg = None;

    tokio::select! {
      // This blocks the entire processor until it finishes handling this message
      // KeyGen specifically may take a notable amount of processing time
      // While that shouldn't be an issue in practice, as after processing an attempt it'll handle
      // the other messages in the queue, it may be beneficial to parallelize these
      // They could potentially be parallelized by type (KeyGen, Sign, Substrate) without issue
      msg = coordinator.recv() => {
        if let Some(last_coordinator_msg) = last_coordinator_msg {
          assert_eq!(msg.id, last_coordinator_msg + 1);
        }
        last_coordinator_msg = Some(msg.id);

        // Only handle this if we haven't already
        if HandledMessageDb::get(&main_db, msg.id).is_none() {
          HandledMessageDb::set(&mut txn, msg.id, &());

          // This is isolated to better think about how its ordered, or rather, about how the other
          // cases aren't ordered
          //
          // While the coordinator messages are ordered, they're not deterministically ordered
          // Tributary-caused messages are deterministically ordered, and Substrate-caused messages
          // are deterministically-ordered, yet they're both shoved into a singular queue
          // The order at which they're shoved in together isn't deterministic
          //
          // This is safe so long as Tributary and Substrate messages don't both expect mutable
          // references over the same data
          handle_coordinator_msg(
            &mut txn,
            &network,
            &mut coordinator,
            &mut tributary_mutable,
            &mut substrate_mutable,
            &msg,
          ).await;
        }

        outer_msg = Some(msg);
      },

      scanner_event = substrate_mutable.next_scanner_event() => {
        let msg = substrate_mutable.scanner_event_to_multisig_event(
          &mut txn,
          &network,
          scanner_event
        ).await;

        match msg {
          MultisigEvent::Batches(retired_key_new_key, batches) => {
            // Start signing this batch
            for batch in batches {
              info!("created batch {} ({} instructions)", batch.id, batch.instructions.len());

              // The coordinator expects BatchPreprocess to immediately follow Batch
              coordinator.send(
                messages::substrate::ProcessorMessage::Batch { batch: batch.clone() }
              ).await;

              if let Some(batch_signer) = tributary_mutable.batch_signer.as_mut() {
                if let Some(msg) = batch_signer.sign(&mut txn, batch) {
                  coordinator.send(msg).await;
                }
              }
            }

            if let Some((retired_key, new_key)) = retired_key_new_key {
              // Safe to mutate since all signing operations are done and no more will be added
              if let Some(retired_session) = SessionDb::get(&txn, retired_key.to_bytes().as_ref()) {
                tributary_mutable.signers.remove(&retired_session);
              }
              tributary_mutable.batch_signer.take();

              let keys = tributary_mutable.key_gen.keys(&new_key);
              if let Some((session, (substrate_keys, _))) = keys {
                tributary_mutable.batch_signer =
                  Some(BatchSigner::new(N::NETWORK, session, substrate_keys));
              }
            }
          },
          MultisigEvent::Completed(key, id, tx) => {
            if let Some(session) = SessionDb::get(&txn, &key) {
              let signer = tributary_mutable.signers.get_mut(&session).unwrap();
              if let Some(msg) = signer.completed(&mut txn, id, &tx) {
                coordinator.send(msg).await;
              }
            }
          }
        }
      },
    }

    txn.commit();
    if let Some(msg) = outer_msg {
      coordinator.ack(msg).await;
    }
  }
}

#[tokio::main]
async fn main() {
  // Override the panic handler with one which will panic if any tokio task panics
  {
    let existing = std::panic::take_hook();
    std::panic::set_hook(Box::new(move |panic| {
      existing(panic);
      const MSG: &str = "exiting the process due to a task panicking";
      println!("{MSG}");
      log::error!("{MSG}");
      std::process::exit(1);
    }));
  }

  if std::env::var("RUST_LOG").is_err() {
    std::env::set_var("RUST_LOG", serai_env::var("RUST_LOG").unwrap_or_else(|| "info".to_string()));
  }
  env_logger::init();

  #[allow(unused_variables, unreachable_code)]
  let db = {
    #[cfg(all(feature = "parity-db", feature = "rocksdb"))]
    panic!("built with parity-db and rocksdb");
    #[cfg(all(feature = "parity-db", not(feature = "rocksdb")))]
    let db =
      serai_db::new_parity_db(&serai_env::var("DB_PATH").expect("path to DB wasn't specified"));
    #[cfg(feature = "rocksdb")]
    let db =
      serai_db::new_rocksdb(&serai_env::var("DB_PATH").expect("path to DB wasn't specified"));
    db
  };

  // Network configuration
  let url = {
    let login = env::var("NETWORK_RPC_LOGIN").expect("network RPC login wasn't specified");
    let hostname = env::var("NETWORK_RPC_HOSTNAME").expect("network RPC hostname wasn't specified");
    let port = env::var("NETWORK_RPC_PORT").expect("network port domain wasn't specified");
    "http://".to_string() + &login + "@" + &hostname + ":" + &port
  };

  let network_id = match env::var("NETWORK").expect("network wasn't specified").as_str() {
    "bitcoin" => NetworkId::Bitcoin,
    "ethereum" => NetworkId::Ethereum,
    "monero" => NetworkId::Monero,
    _ => panic!("unrecognized network"),
  };

  let coordinator = MessageQueue::from_env(Service::Processor(network_id));

  match network_id {
    #[cfg(feature = "bitcoin")]
    NetworkId::Bitcoin => run(db, Bitcoin::new(url).await, coordinator).await,
    #[cfg(feature = "ethereum")]
    NetworkId::Ethereum => {
      let relayer_hostname = env::var("ETHEREUM_RELAYER_HOSTNAME")
@@ -327,8 +73,5 @@ async fn main() {
      let relayer_url = relayer_hostname + ":" + &relayer_port;
      run(db.clone(), Ethereum::new(db, url, relayer_url).await, coordinator).await
    }
    #[cfg(feature = "monero")]
    NetworkId::Monero => run(db, Monero::new(url).await, coordinator).await,
    _ => panic!("spawning a processor for an unsupported network"),
  }
}