// serai/coordinator/src/tributary/scanner.rs
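// Scans the blocks of a Tributary (a validator set's local blockchain), handling slash evidence
// and application transactions, and scheduling re-attempts for protocols (the DKG, cosigning,
// Batch signing, and transaction signing) which didn't complete.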

use core::{marker::PhantomData, ops::Deref, future::Future, time::Duration};
use std::sync::Arc;
use zeroize::Zeroizing;
use ciphersuite::{group::GroupEncoding, Ciphersuite, Ristretto};
use frost::Participant;
use tokio::sync::broadcast;
use scale::{Encode, Decode};
use serai_client::{
  primitives::{SeraiAddress, Signature},
  validator_sets::primitives::{KeyPair, ValidatorSet},
  Serai,
};
use serai_db::DbTxn;
use processor_messages::coordinator::SubstrateSignableId;
use tributary::{
  TransactionKind, Transaction as TributaryTransaction, TransactionError, Block, TributaryReader,
  tendermint::{
    tx::{TendermintTx, Evidence, decode_signed_message},
    TendermintNetwork,
  },
};
use crate::{Db, processors::Processors, substrate::BatchInstructionsHashDb, tributary::*, P2p};
#[derive(Clone, Copy, PartialEq, Eq, Debug, Encode, Decode)]
pub enum RecognizedIdType {
  Batch,
  Plan,
}
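
// Callback for when a Tributary recognizes an ID (for a Batch or a Plan), letting the rest of the
// coordinator act on it.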
#[async_trait::async_trait]
pub trait RIDTrait {
  async fn recognized_id(
    &self,
    set: ValidatorSet,
    genesis: [u8; 32],
    kind: RecognizedIdType,
    id: Vec<u8>,
  );
}
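
// Blanket implementation so any matching async closure can serve as the callback. For example
// (illustrative only), a closure of the form
//   |set, genesis, kind, id| async move { /* handle the recognized ID */ }
// satisfies RIDTrait.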
#[async_trait::async_trait]
impl<
    FRid: Send + Future<Output = ()>,
    F: Sync + Fn(ValidatorSet, [u8; 32], RecognizedIdType, Vec<u8>) -> FRid,
  > RIDTrait for F
{
  async fn recognized_id(
    &self,
    set: ValidatorSet,
    genesis: [u8; 32],
    kind: RecognizedIdType,
    id: Vec<u8>,
  ) {
    (self)(set, genesis, kind, id).await
  }
}
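
// Abstraction over publishing transactions to Serai (currently solely the set_keys transaction),
// implemented below for Serai itself.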
#[async_trait::async_trait]
pub trait PublishSeraiTransaction {
  async fn publish_set_keys(
    &self,
    set: ValidatorSet,
    removed: Vec<SeraiAddress>,
    key_pair: KeyPair,
    signature: Signature,
  );
}
mod impl_pst_for_serai {
  use super::*;

  use serai_client::SeraiValidatorSets;

  // Uses a macro because Rust can't resolve the lifetimes/generics around the check function
  // check is expected to return true if the effect has already occurred
  // The generated publish function will return true if *we* published the transaction
  macro_rules! common_pst {
    ($Meta: ty, $check: ident) => {
      async fn publish(
        serai: &Serai,
        set: ValidatorSet,
        tx: serai_client::Transaction,
        meta: $Meta,
      ) -> bool {
        loop {
          match serai.publish(&tx).await {
            Ok(_) => return true,
            // This is assumed to be some ephemeral error due to the assumed fault-free
            // creation
            // TODO2: Differentiate connection errors from invariants
            Err(e) => {
              if let Ok(serai) = serai.as_of_latest_finalized_block().await {
                let serai = serai.validator_sets();
                // The following block is irrelevant, and can/likely will fail, if we're publishing
                // a TX for an old session
                // If we're on a newer session, move on
                if let Ok(Some(current_session)) = serai.session(set.network).await {
                  if current_session.0 > set.session.0 {
                    log::warn!(
                      "trying to publish a TX relevant to set {set:?} which isn't the latest"
                    );
                    return false;
                  }
                }
                // Check if someone else published the TX in question
                if $check(serai, set, meta).await {
                  return false;
                }
                log::error!("couldn't connect to Serai node to publish TX: {e:?}");
                tokio::time::sleep(core::time::Duration::from_secs(5)).await;
              }
            }
          }
        }
      }
    };
  }

  #[async_trait::async_trait]
  impl PublishSeraiTransaction for Serai {
    async fn publish_set_keys(
      &self,
      set: ValidatorSet,
      removed: Vec<SeraiAddress>,
      key_pair: KeyPair,
      signature: Signature,
    ) {
      let tx = SeraiValidatorSets::set_keys(set.network, removed, key_pair, signature);
      async fn check(serai: SeraiValidatorSets<'_>, set: ValidatorSet, (): ()) -> bool {
        if matches!(serai.keys(set).await, Ok(Some(_))) {
          log::info!("another coordinator set key pair for {:?}", set);
          return true;
        }
        false
      }
      common_pst!((), check);
      if publish(self, set, tx, ()).await {
        log::info!("published set keys for {set:?}");
      }
    }
  }
}
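
// Abstraction over publishing a transaction to this Tributary, implemented below for any matching
// async closure.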
#[async_trait::async_trait]
pub trait PTTTrait {
  async fn publish_tributary_tx(&self, tx: Transaction);
}
#[async_trait::async_trait]
impl<FPtt: Send + Future<Output = ()>, F: Sync + Fn(Transaction) -> FPtt> PTTTrait for F {
  async fn publish_tributary_tx(&self, tx: Transaction) {
    (self)(tx).await
  }
}
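
// Context for handling a single Tributary block: the DB transaction to write under, our key, the
// callbacks to publish with, the Tributary's spec, and the block itself.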
pub struct TributaryBlockHandler<
  'a,
  T: DbTxn,
  Pro: Processors,
  PST: PublishSeraiTransaction,
  PTT: PTTTrait,
  RID: RIDTrait,
  P: P2p,
> {
  pub txn: &'a mut T,
  pub our_key: &'a Zeroizing<<Ristretto as Ciphersuite>::F>,
  pub recognized_id: &'a RID,
  pub processors: &'a Pro,
  pub publish_serai_tx: &'a PST,
  pub publish_tributary_tx: &'a PTT,
  pub spec: &'a TributarySpec,
  block: Block<Transaction>,
  pub block_number: u32,
  _p2p: PhantomData<P>,
}
impl<
    T: DbTxn,
    Pro: Processors,
    PST: PublishSeraiTransaction,
    PTT: PTTTrait,
    RID: RIDTrait,
    P: P2p,
  > TributaryBlockHandler<'_, T, Pro, PST, PTT, RID, P>
{
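  // Record a validator (by their Ristretto key) as fatally slashed on this Tributary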
  pub fn fatal_slash(&mut self, slashing: [u8; 32], reason: &str) {
    // TODO: If this fatal slash puts the remaining set below the threshold, spin
    let genesis = self.spec.genesis();
    log::warn!("fatally slashing {}. reason: {}", hex::encode(slashing), reason);
    FatallySlashed::set_fatally_slashed(self.txn, genesis, slashing);
    // TODO: disconnect the node from network/ban from further participation in all Tributaries
    // TODO: If during DKG, trigger a re-attempt
    // Despite triggering a re-attempt, this DKG may still complete and may become in-use
  }
  // TODO: Once Substrate confirms a key, we need to rotate our validator set OR form a second
  // Tributary post-DKG
  // https://github.com/serai-dex/serai/issues/426
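
  // As fatal_slash, yet resolves the validator's key from their DKG participant index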
  pub fn fatal_slash_with_participant_index(
    &mut self,
    removed: &[<Ristretto as Ciphersuite>::G],
    i: Participant,
    reason: &str,
  ) {
    // Resolve from Participant to <Ristretto as Ciphersuite>::G
    let i = u16::from(i);
    let mut validator = None;
    for (potential, _) in self.spec.validators() {
      let v_i = self.spec.i(removed, potential).unwrap();
      if (u16::from(v_i.start) <= i) && (i < u16::from(v_i.end)) {
        validator = Some(potential);
        break;
      }
    }
    let validator = validator.unwrap();
    self.fatal_slash(validator.to_bytes(), reason);
  }
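
  // Handle a Tributary block: slash producers of Tendermint evidence, handle application
  // transactions, then dispatch re-attempts for any topics scheduled to re-attempt as of this
  // block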
  async fn handle<D: Db>(mut self) {
    log::info!("found block for Tributary {:?}", self.spec.set());
    let transactions = self.block.transactions.clone();
    for tx in transactions {
      match tx {
        TributaryTransaction::Tendermint(TendermintTx::SlashEvidence(ev)) => {
          // Since the evidence is on the chain, it should already have been validated
          // We can just punish the signer
          let data = match ev {
            Evidence::ConflictingMessages(first, second) |
            Evidence::ConflictingPrecommit(first, second) => (first, Some(second)),
            Evidence::InvalidPrecommit(first) | Evidence::InvalidValidRound(first) => (first, None),
          };
          let msgs = (
            decode_signed_message::<TendermintNetwork<D, Transaction, P>>(&data.0).unwrap(),
            if data.1.is_some() {
              Some(
                decode_signed_message::<TendermintNetwork<D, Transaction, P>>(&data.1.unwrap())
                  .unwrap(),
              )
            } else {
              None
            },
          );
          // Since anything with evidence is fundamentally faulty behavior, not just temporal
          // errors, mark the node as fatally slashed
          self.fatal_slash(msgs.0.msg.sender, &format!("invalid tendermint messages: {msgs:?}"));
        }
        TributaryTransaction::Application(tx) => {
          self.handle_application_tx(tx).await;
        }
      }
    }

    let genesis = self.spec.genesis();
    for topic in ReattemptDb::take(self.txn, genesis, self.block_number) {
      let attempt = AttemptDb::start_next_attempt(self.txn, genesis, topic);
      log::info!("re-attempting {topic:?} with attempt {attempt}");
      /*
        All of these have the same common flow:
          1) Check if this re-attempt is actually needed
          2) If so, dispatch whatever events are needed

        This is because we *always* re-attempt any protocol which had participation. That doesn't
        mean we *should* re-attempt this protocol.

        The alternatives were:
          1) Note on-chain we completed a protocol, halting re-attempts upon 34%.
          2) Vote on-chain to re-attempt a protocol.

        This scheme doesn't send any additional messages in the success case (whereas
        alternative #1 does) and doesn't have the overhead of alternative #2, which sends votes
        and then preprocesses (this only sends preprocesses).
      */
      match topic {
        Topic::Dkg => {
          FatalSlashesAsOfDkgAttempt::set(
            self.txn,
            genesis,
            attempt,
            &FatalSlashes::get(self.txn, genesis).unwrap_or(vec![]),
          );
          if DkgCompleted::get(self.txn, genesis).is_none() {
            // Since it wasn't completed, instruct the processor to start the next attempt
            let id =
              processor_messages::key_gen::KeyGenId { session: self.spec.set().session, attempt };
            let removed = crate::tributary::latest_removed(self.txn, genesis);
            let our_i =
              self.spec.i(&removed, Ristretto::generator() * self.our_key.deref()).unwrap();
            // TODO: Don't fatally slash, yet also don't include, parties who have been offline,
            // so long as we still meet the needed threshold. This will require that any parties
            // removed for being offline, who aren't participating in the confirmed key, drop the
            // Tributary and notify their processor.
            // TODO: Instead of DKG confirmations taking a n-of-n MuSig, have it take a t-of-n with
            // a specification of those explicitly removed and those removed due to being offline.
            let params =
              frost::ThresholdParams::new(self.spec.t(), self.spec.n(&removed), our_i.start)
                .unwrap();
            let shares = u16::from(our_i.end) - u16::from(our_i.start);
            self
              .processors
              .send(
                self.spec.set().network,
                processor_messages::key_gen::CoordinatorMessage::GenerateKey { id, params, shares },
              )
              .await;
          }
        }
        Topic::DkgConfirmation => {
          panic!("re-attempting DkgConfirmation when we should be re-attempting the Dkg")
        }
        Topic::SubstrateSign(inner_id) => {
          let id = processor_messages::coordinator::SubstrateSignId {
            session: self.spec.set().session,
            id: inner_id,
            attempt,
          };
          match inner_id {
            SubstrateSignableId::CosigningSubstrateBlock(block) => {
              let block_number = SeraiBlockNumber::get(self.txn, block)
                .expect("couldn't get the block number for prior attempted cosign");
              // Check if the cosigner has a signature from our set for this block/a newer one
              let latest_cosign =
                crate::cosign_evaluator::LatestCosign::get(self.txn, self.spec.set().network)
                  .map_or(0, |cosign| cosign.block_number);
              if latest_cosign < block_number {
                // Instruct the processor to start the next attempt
                self
                  .processors
                  .send(
                    self.spec.set().network,
                    processor_messages::coordinator::CoordinatorMessage::CosignSubstrateBlock {
                      id,
                      block_number,
                    },
                  )
                  .await;
              }
            }
            SubstrateSignableId::Batch(batch) => {
              // If the Batch hasn't appeared on-chain...
              if BatchInstructionsHashDb::get(self.txn, self.spec.set().network, batch).is_none() {
                // Instruct the processor to start the next attempt
                // The processor won't continue if it's already signed a Batch
                // Checking if the Batch is already on-chain beforehand just may save the
                // non-participating 33% from publishing their re-attempt messages
                self
                  .processors
                  .send(
                    self.spec.set().network,
                    processor_messages::coordinator::CoordinatorMessage::BatchReattempt { id },
                  )
                  .await;
              }
            }
          }
        }
        Topic::Sign(id) => {
          // Instruct the processor to start the next attempt
          // If it has already noted a completion, it won't send a preprocess and will simply drop
          // the re-attempt message
          self
            .processors
            .send(
              self.spec.set().network,
              processor_messages::sign::CoordinatorMessage::Reattempt {
                id: processor_messages::sign::SignId {
                  session: self.spec.set().session,
                  id,
                  attempt,
                },
              },
            )
            .await;
        }
      }
    }
  }
}
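
// Handle all blocks after the last handled block, so long as each block's Provided transactions
// are locally available, advancing the last-handled block in the DB as we go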
#[allow(clippy::too_many_arguments)]
pub(crate) async fn handle_new_blocks<
  D: Db,
  Pro: Processors,
  PST: PublishSeraiTransaction,
  PTT: PTTTrait,
  RID: RIDTrait,
  P: P2p,
>(
  db: &mut D,
  key: &Zeroizing<<Ristretto as Ciphersuite>::F>,
  recognized_id: &RID,
  processors: &Pro,
  publish_serai_tx: &PST,
  publish_tributary_tx: &PTT,
  spec: &TributarySpec,
  tributary: &TributaryReader<D, Transaction>,
) {
  let genesis = tributary.genesis();
  let mut last_block = LastHandledBlock::get(db, genesis).unwrap_or(genesis);
  let mut block_number = TributaryBlockNumber::get(db, last_block).unwrap_or(0);
  while let Some(next) = tributary.block_after(&last_block) {
    let block = tributary.block(&next).unwrap();
    block_number += 1;

    // Make sure we have all of the provided transactions for this block
    for tx in &block.transactions {
      // Provided TXs will appear first in the Block, so we can break after we hit a non-Provided
      let TransactionKind::Provided(order) = tx.kind() else {
        break;
      };
      // make sure we have all the provided txs in this block locally
      if !tributary.locally_provided_txs_in_block(&block.hash(), order) {
        return;
      }
    }

    let mut txn = db.txn();
    TributaryBlockNumber::set(&mut txn, next, &block_number);
    (TributaryBlockHandler {
      txn: &mut txn,
      spec,
      our_key: key,
      recognized_id,
      processors,
      publish_serai_tx,
      publish_tributary_tx,
      block,
      block_number,
      _p2p: PhantomData::<P>,
    })
    .handle::<D>()
    .await;
    last_block = next;
    LastHandledBlock::set(&mut txn, genesis, &next);
    txn.commit();
  }
}
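
// Task which, for each new Tributary, spawns a dedicated task scanning its blocks until the
// validator set is retired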
pub(crate) async fn scan_tributaries_task<
  D: Db,
  Pro: Processors,
  P: P2p,
  RID: 'static + Send + Sync + Clone + RIDTrait,
>(
  raw_db: D,
  key: Zeroizing<<Ristretto as Ciphersuite>::F>,
  recognized_id: RID,
  processors: Pro,
  serai: Arc<Serai>,
  mut tributary_event: broadcast::Receiver<crate::TributaryEvent<D, P>>,
) {
  log::info!("scanning tributaries");
  loop {
    match tributary_event.recv().await {
      Ok(crate::TributaryEvent::NewTributary(crate::ActiveTributary { spec, tributary })) => {
        // For each Tributary, spawn a dedicated scanner task
        tokio::spawn({
          let raw_db = raw_db.clone();
          let key = key.clone();
          let recognized_id = recognized_id.clone();
          let processors = processors.clone();
          let serai = serai.clone();
          async move {
            let spec = &spec;
            let reader = tributary.reader();
            let mut tributary_db = raw_db.clone();
            loop {
              // Check if the set was retired, and if so, don't further operate
              if crate::db::RetiredTributaryDb::get(&raw_db, spec.set()).is_some() {
                break;
              }

              // Obtain the next block notification now to prevent obtaining it immediately after
              // the next block occurs
              let next_block_notification = tributary.next_block_notification().await;
              handle_new_blocks::<_, _, _, _, _, P>(
                &mut tributary_db,
                &key,
                &recognized_id,
                &processors,
                &*serai,
                &|tx: Transaction| {
                  let tributary = tributary.clone();
                  async move {
                    match tributary.add_transaction(tx.clone()).await {
                      Ok(_) => {}
                      // Can happen as this occurs on a distinct DB TXN
                      Err(TransactionError::InvalidNonce) => {
                        log::warn!(
                          "publishing TX {tx:?} returned InvalidNonce. was it already added?"
                        )
                      }
                      Err(e) => panic!("created an invalid transaction: {e:?}"),
                    }
                  }
                },
                spec,
                &reader,
              )
              .await;

              // Run either when the notification fires, or every interval of block_time
              let _ = tokio::time::timeout(
                Duration::from_secs(tributary::Tributary::<D, Transaction, P>::block_time().into()),
                next_block_notification,
              )
              .await;
            }
          }
        });
      }
      // The above loop simply checks the DB every few seconds, voiding the need for this event
      Ok(crate::TributaryEvent::TributaryRetired(_)) => {}
      Err(broadcast::error::RecvError::Lagged(_)) => {
        panic!("scan_tributaries lagged to handle tributary_event")
      }
      Err(broadcast::error::RecvError::Closed) => panic!("tributary_event sender closed"),
    }
  }
}