serai/processor/src/batch_signer.rs
commit 571195bfda by Luke Parker: Resolve #360 (#456)
* Remove NetworkId from processor-messages

Because the intent binds to the sender/receiver, the NetworkId isn't needed as part of it.

The processor knows what the network is.

The coordinator knows which to use because it's sending this message to the
processor for that network.

Also removes the unused zeroize.

* ProcessorMessage::Completed use Session instead of key

* Move SubstrateSignId to Session

* Finish replacing key with session
2023-11-26 12:14:23 -05:00


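// BatchSigner runs the FROST signing protocol (Schnorrkel over Ristretto) to
// produce SignedBatches for publication to Substrate, persisting attempts and
// completions so signing state is handled sanely across reboots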
use core::{marker::PhantomData, fmt};
use std::collections::HashMap;
use rand_core::OsRng;
use frost::{
curve::Ristretto,
ThresholdKeys, FrostError,
algorithm::Algorithm,
sign::{
Writable, PreprocessMachine, SignMachine, SignatureMachine, AlgorithmMachine,
AlgorithmSignMachine, AlgorithmSignatureMachine,
},
};
use frost_schnorrkel::Schnorrkel;
use log::{info, debug, warn};
use scale::Encode;
use serai_client::{
primitives::{NetworkId, BlockHash},
in_instructions::primitives::{Batch, SignedBatch, batch_message},
validator_sets::primitives::Session,
};
use messages::coordinator::*;
use crate::{Get, DbTxn, Db, create_db};
// Generate an ID unique to a Batch
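// SCALE encodes the NetworkId as 1 byte and the u32 batch ID as 4 bytes, hence
// the 5-byte result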
fn batch_sign_id(network: NetworkId, id: u32) -> [u8; 5] {
(network, id).encode().try_into().unwrap()
}
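// CompletedDb tracks batches which were signed or observed on-chain, AttemptDb
// tracks attempts we've already participated in (reusing one would be unsafe),
// and BatchDb saves each SignedBatch in case it's needed for recovery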
create_db!(
BatchSignerDb {
CompletedDb: (id: [u8; 5]) -> (),
AttemptDb: (id: [u8; 5], attempt: u32) -> (),
BatchDb: (block: BlockHash) -> SignedBatch
}
);
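// Aliases for the FROST preprocess/signature-share types used with Schnorrkel
// over Ristretto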
type Preprocess = <AlgorithmMachine<Ristretto, Schnorrkel> as PreprocessMachine>::Preprocess;
type SignatureShare = <AlgorithmSignMachine<Ristretto, Schnorrkel> as SignMachine<
<Schnorrkel as Algorithm<Ristretto>>::Signature,
>>::SignatureShare;
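// Signs Batches into SignedBatches using this session's Ristretto threshold keys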
pub struct BatchSigner<D: Db> {
db: PhantomData<D>,
network: NetworkId,
session: Session,
keys: Vec<ThresholdKeys<Ristretto>>,
signable: HashMap<[u8; 5], Batch>,
attempt: HashMap<[u8; 5], u32>,
#[allow(clippy::type_complexity)]
preprocessing:
HashMap<[u8; 5], (Vec<AlgorithmSignMachine<Ristretto, Schnorrkel>>, Vec<Preprocess>)>,
#[allow(clippy::type_complexity)]
signing:
HashMap<[u8; 5], (AlgorithmSignatureMachine<Ristretto, Schnorrkel>, Vec<SignatureShare>)>,
}
impl<D: Db> fmt::Debug for BatchSigner<D> {
fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt
.debug_struct("BatchSigner")
.field("signable", &self.signable)
.field("attempt", &self.attempt)
.finish_non_exhaustive()
}
}
impl<D: Db> BatchSigner<D> {
pub fn new(
network: NetworkId,
session: Session,
keys: Vec<ThresholdKeys<Ristretto>>,
) -> BatchSigner<D> {
assert!(!keys.is_empty());
BatchSigner {
db: PhantomData,
network,
session,
keys,
signable: HashMap::new(),
attempt: HashMap::new(),
preprocessing: HashMap::new(),
signing: HashMap::new(),
}
}
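// Check a SubstrateSignId is for this session and for an attempt we're actually
// participating in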
fn verify_id(&self, id: &SubstrateSignId) -> Result<(Session, [u8; 5], u32), ()> {
let SubstrateSignId { session, id, attempt } = id;
let SubstrateSignableId::Batch(id) = id else { panic!("BatchSigner handed non-Batch") };
assert_eq!(session, &self.session);
// Check the attempt lines up
match self.attempt.get(id) {
// If we don't have an attempt logged, it's because the coordinator is faulty OR because we
// rebooted OR we detected the signed batch on chain
// The latter is the expected flow for batches not actively being participated in
None => {
warn!("not attempting batch {} #{}", hex::encode(id), attempt);
Err(())?;
}
Some(our_attempt) => {
if attempt != our_attempt {
warn!(
"sent signing data for batch {} #{} yet we have attempt #{}",
hex::encode(id),
attempt,
our_attempt
);
Err(())?;
}
}
}
Ok((*session, *id, *attempt))
}
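// Begin (or re-begin) an attempt to sign the specified batch, emitting our
// preprocesses for it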
#[must_use]
async fn attempt(
&mut self,
txn: &mut D::Transaction<'_>,
id: [u8; 5],
attempt: u32,
) -> Option<ProcessorMessage> {
// See batch_signed's commentary for why this doesn't emit SignedBatch
if CompletedDb::get(txn, id).is_some() {
return None;
}
// Check if we're already working on this attempt
if let Some(curr_attempt) = self.attempt.get(&id) {
if curr_attempt >= &attempt {
warn!(
"told to attempt {} #{} yet we're already working on {}",
hex::encode(id),
attempt,
curr_attempt
);
return None;
}
}
// Start this attempt
let block = if let Some(batch) = self.signable.get(&id) {
batch.block
} else {
warn!("told to attempt signing a batch we aren't currently signing for");
return None;
};
// Delete any existing machines
self.preprocessing.remove(&id);
self.signing.remove(&id);
// Update the attempt number
self.attempt.insert(id, attempt);
info!("signing batch {} #{}", hex::encode(id), attempt);
// If we reboot mid-sign, the current design has us abort all signs and wait for later
// attempts/new signing protocols
// This is distinct from the DKG which will continue DKG sessions, even on reboot
// This is because signing is tolerant of failures of up to 1/3rd of the group
// The DKG requires 100% participation
// While we could apply tricks similar to the DKG's (a seeded RNG) to support
// reboots, it's not worth the complexity when messing up here leaks our secret share
//
// Despite this, on reboot, we'll get told of active signing items, and may be in this
// branch again for something we've already attempted
//
// Only run if this hasn't already been attempted
// TODO: This isn't complete as this txn may not be committed with the expected timing
if AttemptDb::get(txn, id, attempt).is_some() {
warn!(
"already attempted batch {}, attempt #{}. this is an error if we didn't reboot",
hex::encode(id),
attempt
);
return None;
}
AttemptDb::set(txn, id, attempt, &());
let mut machines = vec![];
let mut preprocesses = vec![];
let mut serialized_preprocesses = vec![];
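// We may hold multiple key shares, so run a signing machine for each of them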
for keys in &self.keys {
// b"substrate" is a literal from sp-core
let machine = AlgorithmMachine::new(Schnorrkel::new(b"substrate"), keys.clone());
let (machine, preprocess) = machine.preprocess(&mut OsRng);
machines.push(machine);
serialized_preprocesses.push(preprocess.serialize().try_into().unwrap());
preprocesses.push(preprocess);
}
self.preprocessing.insert(id, (machines, preprocesses));
let id = SubstrateSignId { session: self.session, id: SubstrateSignableId::Batch(id), attempt };
// Broadcast our preprocesses
Some(ProcessorMessage::BatchPreprocess { id, block, preprocesses: serialized_preprocesses })
}
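// Queue a Batch for signing, immediately starting attempt 0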
#[must_use]
pub async fn sign(
&mut self,
txn: &mut D::Transaction<'_>,
batch: Batch,
) -> Option<ProcessorMessage> {
debug_assert_eq!(self.network, batch.network);
let id = batch_sign_id(batch.network, batch.id);
if CompletedDb::get(txn, id).is_some() {
debug!("told to sign a batch we've already completed signing");
// See batch_signed for commentary on why this simply returns
return None;
}
self.signable.insert(id, batch);
self.attempt(txn, id, 0).await
}
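// Handle a message from the coordinator, returning our response, if any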
#[must_use]
pub async fn handle(
&mut self,
txn: &mut D::Transaction<'_>,
msg: CoordinatorMessage,
) -> Option<messages::ProcessorMessage> {
match msg {
CoordinatorMessage::CosignSubstrateBlock { .. } => {
panic!("BatchSigner passed CosignSubstrateBlock")
}
CoordinatorMessage::SubstratePreprocesses { id, preprocesses } => {
let (session, id, attempt) = self.verify_id(&id).ok()?;
let substrate_sign_id =
SubstrateSignId { session, id: SubstrateSignableId::Batch(id), attempt };
let (machines, our_preprocesses) = match self.preprocessing.remove(&id) {
// Either we rebooted, hit an RPC error, or some invariant was broken
None => {
warn!(
"not preprocessing for {}. this is an error if we didn't reboot",
hex::encode(id),
);
return None;
}
Some(preprocess) => preprocess,
};
let mut parsed = HashMap::new();
for l in {
let mut keys = preprocesses.keys().cloned().collect::<Vec<_>>();
keys.sort();
keys
} {
let mut preprocess_ref = preprocesses.get(&l).unwrap().as_slice();
let Ok(res) = machines[0].read_preprocess(&mut preprocess_ref) else {
return Some(
(ProcessorMessage::InvalidParticipant { id: substrate_sign_id, participant: l })
.into(),
);
};
if !preprocess_ref.is_empty() {
return Some(
(ProcessorMessage::InvalidParticipant { id: substrate_sign_id, participant: l })
.into(),
);
}
parsed.insert(l, res);
}
let preprocesses = parsed;
// Only keep a single machine as we only need one to get the signature
let mut signature_machine = None;
let mut shares = vec![];
let mut serialized_shares = vec![];
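// Each machine needs the preprocesses of every other participant. The
// coordinator doesn't echo our own back, so insert the preprocesses from our
// other local key shares manually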
for (m, machine) in machines.into_iter().enumerate() {
let mut preprocesses = preprocesses.clone();
for (i, our_preprocess) in our_preprocesses.clone().into_iter().enumerate() {
if i != m {
assert!(preprocesses.insert(self.keys[i].params().i(), our_preprocess).is_none());
}
}
let (machine, share) = match machine
.sign(preprocesses, &batch_message(&self.signable[&id]))
{
Ok(res) => res,
Err(e) => match e {
FrostError::InternalError(_) |
FrostError::InvalidParticipant(_, _) |
FrostError::InvalidSigningSet(_) |
FrostError::InvalidParticipantQuantity(_, _) |
FrostError::DuplicatedParticipant(_) |
FrostError::MissingParticipant(_) => unreachable!(),
FrostError::InvalidPreprocess(l) | FrostError::InvalidShare(l) => {
return Some(
(ProcessorMessage::InvalidParticipant { id: substrate_sign_id, participant: l })
.into(),
)
}
},
};
if m == 0 {
signature_machine = Some(machine);
}
let mut share_bytes = [0; 32];
share_bytes.copy_from_slice(&share.serialize());
serialized_shares.push(share_bytes);
shares.push(share);
}
self.signing.insert(id, (signature_machine.unwrap(), shares));
// Broadcast our shares
Some(
(ProcessorMessage::SubstrateShare { id: substrate_sign_id, shares: serialized_shares })
.into(),
)
}
CoordinatorMessage::SubstrateShares { id, shares } => {
let (session, id, attempt) = self.verify_id(&id).ok()?;
let substrate_sign_id =
SubstrateSignId { session, id: SubstrateSignableId::Batch(id), attempt };
let (machine, our_shares) = match self.signing.remove(&id) {
// We rebooted, hit an RPC error, or some invariant was broken
None => {
// If preprocessing has this ID, it means we were never sent the preprocess by the
// coordinator
if self.preprocessing.contains_key(&id) {
panic!("never preprocessed yet signing?");
}
warn!(
"not signing for {}. this is an error if we didn't reboot",
hex::encode(id)
);
return None;
}
Some(signing) => signing,
};
let mut parsed = HashMap::new();
for l in {
let mut keys = shares.keys().cloned().collect::<Vec<_>>();
keys.sort();
keys
} {
let mut share_ref = shares.get(&l).unwrap().as_slice();
let Ok(res) = machine.read_share(&mut share_ref) else {
return Some(
(ProcessorMessage::InvalidParticipant { id: substrate_sign_id, participant: l })
.into(),
);
};
if !share_ref.is_empty() {
return Some(
(ProcessorMessage::InvalidParticipant { id: substrate_sign_id, participant: l })
.into(),
);
}
parsed.insert(l, res);
}
let mut shares = parsed;
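// The machine we kept was our first key share's, which already has its own
// share, so only insert the shares from our other local key shares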
for (i, our_share) in our_shares.into_iter().enumerate().skip(1) {
assert!(shares.insert(self.keys[i].params().i(), our_share).is_none());
}
let sig = match machine.complete(shares) {
Ok(res) => res,
Err(e) => match e {
FrostError::InternalError(_) |
FrostError::InvalidParticipant(_, _) |
FrostError::InvalidSigningSet(_) |
FrostError::InvalidParticipantQuantity(_, _) |
FrostError::DuplicatedParticipant(_) |
FrostError::MissingParticipant(_) => unreachable!(),
FrostError::InvalidPreprocess(l) | FrostError::InvalidShare(l) => {
return Some(
(ProcessorMessage::InvalidParticipant { id: substrate_sign_id, participant: l })
.into(),
)
}
},
};
info!("signed batch {} with attempt #{}", hex::encode(id), attempt);
let batch =
SignedBatch { batch: self.signable.remove(&id).unwrap(), signature: sig.into() };
// Save the batch in case it's needed for recovery
BatchDb::set(txn, batch.batch.block, &batch);
CompletedDb::set(txn, id, &());
// Stop trying to sign for this batch
assert!(self.attempt.remove(&id).is_some());
assert!(self.preprocessing.remove(&id).is_none());
assert!(self.signing.remove(&id).is_none());
Some((messages::substrate::ProcessorMessage::SignedBatch { batch }).into())
}
CoordinatorMessage::BatchReattempt { id } => {
let SubstrateSignableId::Batch(batch_id) = id.id else {
panic!("BatchReattempt passed non-Batch ID")
};
self.attempt(txn, batch_id, id.attempt).await.map(Into::into)
}
}
}
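// Mark a batch as signed on-chain, clearing all signing state for it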
pub fn batch_signed(&mut self, txn: &mut D::Transaction<'_>, id: u32) {
let sign_id = batch_sign_id(self.network, id);
// Stop trying to sign for this batch
CompletedDb::set(txn, sign_id, &());
self.signable.remove(&sign_id);
self.attempt.remove(&sign_id);
self.preprocessing.remove(&sign_id);
self.signing.remove(&sign_id);
// This doesn't emit SignedBatch because it doesn't have access to the SignedBatch
// This function is expected to only be called once Substrate acknowledges this block,
// which means its batch must have been signed
// While a successive batch's signing would also cause this block to be acknowledged, Substrate
// guarantees a batch's ordered inclusion
// This also doesn't return any messages since all mutation from the Batch being signed happens
// on the substrate::CoordinatorMessage::SubstrateBlock message (which SignedBatch is meant to
// end up triggering)
}
}