Implement variable-sized windows into multiexp

Closes https://github.com/serai-dex/serai/issues/17 by using the PrimeFieldBits API. Should greatly speed up small batches, as well as batches in the hundreds. Saves almost a full second on the cross-group DLEq proof.
2025-12-09 20:59:23 +00:00 · 2022-06-30 09:30:24 -04:00
parent 5d115f1e1c
commit 7890827a48
15 changed files with 342 additions and 148 deletions
--- a/crypto/multiexp/src/lib.rs
+++ b/crypto/multiexp/src/lib.rs
@@ -1,3 +1,4 @@
+use ff::PrimeFieldBits;
 use group::Group;

 mod straus;
@@ -11,39 +12,151 @@ mod batch;
 #[cfg(feature = "batch")]
 pub use batch::BatchVerifier;

-#[derive(Clone, Copy, PartialEq, Eq, Debug)]
-enum Algorithm {
-  Straus,
-  Pippenger
+#[cfg(test)]
+mod tests;
+
+// Splits every scalar into groups of `window` bits, one u8 per group, following the order of
+// the scalar's to_le_bits() output (so group 0 holds the first `window` bits yielded)
+// A scalar whose bit length isn't a multiple of the window has its final group zero-padded
+// Panics if window isn't within 1 ..= 8, as each group must fit within a u8
+pub(crate) fn prep_bits<G: Group>(
+  pairs: &[(G::Scalar, G)],
+  window: u8
+) -> Vec<Vec<u8>> where G::Scalar: PrimeFieldBits {
+  // A window of 0 would divide by zero below, and a window above 8 would overflow the u8 shift
+  // (a panic in debug builds, silently misplaced bits in release builds), so reject both
+  assert!((1 ..= 8).contains(&window), "multiexp window must be within 1 ..= 8");
+  let w_usize = usize::from(window);
+
+  let mut groupings = Vec::with_capacity(pairs.len());
+  for pair in pairs {
+    let bits = pair.0.to_le_bits();
+    // Round up so a trailing partial window still gets its own group
+    let mut grouping = vec![0u8; (bits.len() + (w_usize - 1)) / w_usize];
+
+    for (i, bit) in bits.into_iter().enumerate() {
+      let bit = bit as u8;
+      debug_assert_eq!(bit | 1, 1);
+      // Bit i lands in group i / window, at position i % window within that group
+      grouping[i / w_usize] |= bit << (i % w_usize);
+    }
+    groupings.push(grouping);
+  }
+
+  groupings
 }

-fn algorithm(pairs: usize) -> Algorithm {
-  // TODO: Replace this with an actual formula determining which will use less additions
-  // Right now, Straus is used until 600, instead of the far more accurate 300, as Pippenger
-  // operates per byte instead of per nibble, and therefore requires a much longer series to be
-  // performant
-  // Technically, 800 is dalek's number for when to use byte Pippenger, yet given Straus's own
-  // implementation limitations...
-  if pairs < 600 {
-    Algorithm::Straus
+// Builds one lookup table per pair, where index i of a table holds that pair's point added to
+// itself i times (index 0 being the identity), with 2^window entries per table
+pub(crate) fn prep_tables<G: Group>(
+  pairs: &[(G::Scalar, G)],
+  window: u8
+) -> Vec<Vec<G>> {
+  let entries = 2_usize.pow(u32::from(window));
+  pairs.iter().map(|(_, point)| {
+    let mut table = vec![G::identity(); entries];
+    // Walk the multiples by repeated addition, leaving slot 0 as the identity
+    let mut multiple = G::identity();
+    for slot in table.iter_mut().skip(1) {
+      multiple += *point;
+      *slot = multiple;
+    }
+    table
+  }).collect()
+}
+
+// Which multiexp algorithm to run
+// Each variant carries the window size which is passed on to that algorithm's implementation
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+enum Algorithm {
+  Straus(u8),
+  Pippenger(u8)
+}
+
+/*
+Release (with 20 runs, so all of these are off by 20x):
+
+k256
+Straus 3 is more efficient at 5 with 678µs per
+Straus 4 is more efficient at 10 with 530µs per
+Straus 5 is more efficient at 35 with 467µs per
+
+Pippenger 5 is more efficient at 125 with 431µs per
+Pippenger 6 is more efficient at 275 with 349µs per
+Pippenger 7 is more efficient at 375 with 360µs per
+
+dalek
+Straus 3 is more efficient at 5 with 519µs per
+Straus 4 is more efficient at 10 with 376µs per
+Straus 5 is more efficient at 170 with 330µs per
+
+Pippenger 5 is more efficient at 125 with 305µs per
+Pippenger 6 is more efficient at 275 with 250µs per
+Pippenger 7 is more efficient at 450 with 205µs per
+Pippenger 8 is more efficient at 800 with 213µs per
+
+Debug (with 5 runs, so all of these are off by 5x):
+
+k256
+Straus 3 is more efficient at 5 with 2532µs per
+Straus 4 is more efficient at 10 with 1930µs per
+Straus 5 is more efficient at 80 with 1632µs per
+
+Pippenger 5 is more efficient at 150 with 1441µs per
+Pippenger 6 is more efficient at 300 with 1235µs per
+Pippenger 7 is more efficient at 475 with 1182µs per
+Pippenger 8 is more efficient at 625 with 1170µs per
+
+dalek:
+Straus 3 is more efficient at 5 with 971µs per
+Straus 4 is more efficient at 10 with 782µs per
+Straus 5 is more efficient at 75 with 778µs per
+Straus 6 is more efficient at 165 with 867µs per
+
+Pippenger 5 is more efficient at 125 with 677µs per
+Pippenger 6 is more efficient at 250 with 655µs per
+Pippenger 7 is more efficient at 475 with 500µs per
+Pippenger 8 is more efficient at 875 with 499µs per
+*/
+// Selects the algorithm, and the window to run it with, for a multiexp over `len` pairs
+// The thresholds differ between release and debug builds: exactly one of the two cfg-gated
+// chains below is compiled in, and it serves as the function's return value
+fn algorithm(len: usize) -> Algorithm {
+  #[cfg(not(debug_assertions))]
+  if len < 10 {
+    // Straus 2 never showed a performance benefit, even with just 2 elements
+    Algorithm::Straus(3)
+  } else if len < 20 {
+    Algorithm::Straus(4)
+  } else if len < 50 {
+    Algorithm::Straus(5)
+  } else if len < 100 {
+    Algorithm::Pippenger(4)
+  } else if len < 125 {
+    Algorithm::Pippenger(5)
+  } else if len < 275 {
+    Algorithm::Pippenger(6)
+  } else if len < 400 {
+    Algorithm::Pippenger(7)
  } else {
-    Algorithm::Pippenger
+    Algorithm::Pippenger(8)
+  }
+
+  #[cfg(debug_assertions)]
+  if len < 10 {
+    Algorithm::Straus(3)
+  } else if len < 80 {
+    Algorithm::Straus(4)
+  } else if len < 100 {
+    Algorithm::Straus(5)
+  } else if len < 125 {
+    Algorithm::Pippenger(4)
+  } else if len < 275 {
+    Algorithm::Pippenger(5)
+  } else if len < 475 {
+    Algorithm::Pippenger(6)
+  } else if len < 750 {
+    Algorithm::Pippenger(7)
+  } else {
+    Algorithm::Pippenger(8)
  }
 }

 // Performs a multiexp, automatically selecting the optimal algorithm based on amount of pairs
-// Takes in an iterator of scalars and points, with a boolean for if the scalars are little endian
-// encoded in their Reprs or not
-pub fn multiexp<G: Group>(pairs: &[(G::Scalar, G)], little: bool) -> G {
+// Takes in a slice of (scalar, point) pairs, and hands them to straus or pippenger along with
+// the window algorithm() chose for pairs.len()
+pub fn multiexp<G: Group>(pairs: &[(G::Scalar, G)]) -> G where G::Scalar: PrimeFieldBits {
  match algorithm(pairs.len()) {
-    Algorithm::Straus => straus(pairs, little),
-    Algorithm::Pippenger => pippenger(pairs, little)
+    Algorithm::Straus(window) => straus(pairs, window),
+    Algorithm::Pippenger(window) => pippenger(pairs, window)
  }
 }

-pub fn multiexp_vartime<G: Group>(pairs: &[(G::Scalar, G)], little: bool) -> G {
+// Same selection as multiexp, yet dispatching to straus_vartime or pippenger_vartime
+// NOTE(review): presumably these run in variable time, per their names, and so shouldn't be
+// given secret scalars - confirm against their implementations
+pub fn multiexp_vartime<G: Group>(pairs: &[(G::Scalar, G)]) -> G where G::Scalar: PrimeFieldBits {
  match algorithm(pairs.len()) {
-    Algorithm::Straus => straus_vartime(pairs, little),
-    Algorithm::Pippenger => pippenger_vartime(pairs, little)
+    Algorithm::Straus(window) => straus_vartime(pairs, window),
+    Algorithm::Pippenger(window) => pippenger_vartime(pairs, window)
  }
 }