Add module to calculate medians

2025-12-14 06:59:24 +00:00 · 2025-11-25 20:21:45 -05:00
parent 8d8e8a7a77
commit 8ec0582237
11 changed files with 836 additions and 0 deletions
--- a/substrate/median/src/lib.rs
+++ b/substrate/median/src/lib.rs
@@ -0,0 +1,520 @@
+#![cfg_attr(docsrs, feature(doc_cfg))]
+#![doc = include_str!("../README.md")]
+#![cfg_attr(not(feature = "std"), no_std)]
+#![deny(missing_docs)]
+
+use core::cmp::Ordering;
+
+use scale::{EncodeLike, FullCodec};
+use frame_support::storage::*;
+
+mod lexicographic;
+pub use lexicographic::*;
+
+mod average;
+pub use average::*;
+
+mod policy;
+pub use policy::*;
+
+/// The store for a median.
+///
+/// `KeyPrefix` is accepted so a single set of storage values may be used to back multiple medians.
+/// For all `StorageDoubleMap`s however, the hasher of the second key MUST be the identity hasher.
+///
+/// For all storage values present, they MUST be considered opaque to the caller and left
+/// undisturbed. No assumptions may be made about their internal representation nor usage.
+/// Any names or documentation comments are solely for the review of the implementation
+/// itself, and are not intended to signify any potential layout nor use cases to the caller.
+/// ANY external usage has undefined behavior.
+pub trait MedianStore<KeyPrefix: FullCodec, MedianValue: Average + LexicographicEncoding> {
+  /// The policy to use when there are multiple candidate values.
+  const POLICY: Policy;
+
+  /// The amount of items currently present within the median's list.
+  ///
+  /// This counts every instance of every value: `push` always increments it, and `pop` decrements
+  /// it whenever the popped value was present.
+  type Length: StorageMap<KeyPrefix, u64, Query = u64>;
+
+  /// A store for the values currently present within the median.
+  ///
+  /// The value is the amount of instances of this value within the median's list.
+  ///
+  /// A count of `0` is how `push` and `pop` recognize a value which isn't present. `pop` removes
+  /// the entry, rather than writing `0`, when the last instance is popped.
+  type Store: IterableStorageDoubleMap<KeyPrefix, MedianValue::Encoding, u64, Query = u64>;
+
+  /// A secondary store for the values currently present within the median.
+  ///
+  /// An entry is written when the first instance of a value is pushed and removed when its last
+  /// instance is popped. `update_median` iterates this store, starting from the saved median,
+  /// when it has to walk down to a prior value.
+  type ReverseStore: IterableStorageDoubleMap<
+    KeyPrefix,
+    LexicographicReverse<MedianValue>,
+    (),
+    Query = (),
+  >;
+
+  /// The position of the saved median within the list of values.
+  ///
+  /// This is necessary as when a value selected as the current median is present multiple times
+  /// within the list of values, the code does not know _which_ instance was selected as the
+  /// median, as necessary to know when to advance to a lesser/greater value. To resolve this, once
+  /// we know a value is the median value, we always set the position to the _first instance_ of
+  /// the value. This gives us a consistent frame of reference to decide the next steps of the
+  /// algorithm upon.
+  ///
+  /// `push` initializes this to `0` for the first value and `pop` removes it along with the last
+  /// value, so `None` is what `update_median` treats as "nothing to update".
+  type Position: StorageMap<KeyPrefix, u64, Query = Option<u64>>;
+
+  /// The median value.
+  ///
+  /// This may drift from the actual median while an update is performed.
+  ///
+  /// This is set and removed alongside `Position`.
+  type Median: StorageMap<KeyPrefix, MedianValue, Query = Option<MedianValue>>;
+}
+
+// Message for the debug assertions checking that an entry yielded by a storage iterator carries
+// the same key prefix as the one being operated on.
+const KEY_PREFIX_ASSERT: &str = "next value in storage had a different prefix associated";
+// Message for the debug assertions checking that an iterator started from a key did not yield
+// that very key as its first item.
+const AFTER_ASSERT: &str = "iterator yielding *after* key yielded key itself";
+
+/// Update the median.
+///
+/// This function may be called at any point to correctly calculate the current median. It will do
+/// so in an amount of operations linear to the distance from the stored median to the new median.
+///
+/// Since the distance is bounded by the amount of insertions to/removals from the median's list
+/// which have yet to be handled, the following `push` and `pop` functions achieve a constant
+/// amount of operations by calling this function _upon each and every invocation_. This leaves
+/// solely a singular insertion/removal needing to be handled, and a maximum distance of one.
+///
+/// If no position is stored for `key_prefix`, this returns immediately. Otherwise, it walks the
+/// stored position/median up to, or down to, the position `S::POLICY` targets for the current
+/// length, and then writes both `Position` and `Median` back.
+///
+/// This panics if a position is stored without a median, or if either walk runs out of values
+/// before reaching the target position.
+fn update_median<
+  KeyPrefix: FullCodec,
+  MedianValue: Average + LexicographicEncoding,
+  S: MedianStore<KeyPrefix, MedianValue>,
+>(
+  key_prefix: impl Copy + EncodeLike<KeyPrefix>,
+) {
+  // No stored position means there's nothing to update
+  let Some(mut existing_median_pos) = S::Position::get(key_prefix) else {
+    return;
+  };
+  let length = S::Length::get(key_prefix);
+  let target_median_pos = S::POLICY.target_median_pos(length);
+
+  let mut existing_median =
+    S::Median::get(key_prefix).expect("current position yet not current median");
+
+  // We first iterate up to the desired median position
+  {
+    let mut iter = {
+      let existing_median_key =
+        S::Store::hashed_key_for(key_prefix, existing_median.lexicographic_encode());
+      S::Store::iter_from(existing_median_key)
+    };
+
+    let mut existing_median_instances =
+      S::Store::get(key_prefix, existing_median.lexicographic_encode());
+    let mut next_value_first_pos;
+    // `existing_median_pos` is the position of the _first_ instance of `existing_median`, so the
+    // value after it starts once all of its instances have been passed. We only advance while
+    // the target isn't among the instances of the value we're currently on.
+    while {
+      next_value_first_pos = existing_median_pos + existing_median_instances;
+      next_value_first_pos <= target_median_pos
+    } {
+      existing_median_pos = next_value_first_pos;
+      let (_key_prefix, next_value_encoding, next_value_instances) = iter
+        .next()
+        .expect("stored median was before the actual median yet no values were after it");
+      debug_assert_eq!(key_prefix.encode(), _key_prefix.encode(), "{KEY_PREFIX_ASSERT}");
+      debug_assert!(
+        existing_median.lexicographic_encode() != next_value_encoding,
+        "{AFTER_ASSERT}",
+      );
+      existing_median = MedianValue::lexicographic_decode(next_value_encoding);
+      existing_median_instances = next_value_instances;
+    }
+  }
+
+  // Then, we iterate down to the desired median position
+  {
+    let mut iter = {
+      let existing_median_key =
+        S::ReverseStore::hashed_key_for(key_prefix, LexicographicReverse::from(&existing_median));
+      S::ReverseStore::iter_keys_from(existing_median_key)
+    };
+
+    while existing_median_pos > target_median_pos {
+      // The message describes _this_ direction: the stored median is past the target, so running
+      // out of values here means nothing was before it
+      let (_key_prefix, prior_value_encoding) = iter
+        .next()
+        .expect("stored median was after the actual median yet no values were before it");
+      debug_assert_eq!(key_prefix.encode(), _key_prefix.encode(), "{KEY_PREFIX_ASSERT}");
+      let prior_value = prior_value_encoding.into();
+      debug_assert!(prior_value != existing_median, "{AFTER_ASSERT}");
+      // The reverse store only has unit values, so the amount of instances is read from `Store`
+      let prior_value_instances = S::Store::get(key_prefix, prior_value.lexicographic_encode());
+      existing_median = prior_value;
+      // Stepping back over every instance lands on the first instance of the prior value
+      existing_median_pos -= prior_value_instances;
+    }
+  }
+
+  S::Position::set(key_prefix, Some(existing_median_pos));
+  S::Median::set(key_prefix, Some(existing_median));
+}
+
+/// A median.
+///
+/// The implementation only uses a constant amount of database operations to implement insertion
+/// and removal. When instantiated over a database with logarithmic complexities (such as a radix
+/// trie), this effects a median with logarithmic memory/computation complexities (not requiring
+/// loading all values into memory).
+///
+/// This SHOULD NOT be used for small collections where the linear (or even quadratic) complexities
+/// still out-perform how expensive database operations are. In those cases, the collection should
+/// be written to a single storage slot, read entirely, sorted, and the median should be
+/// immediately taken via indexing the value halfway through the collection.
+///
+/// This trait is implemented for every `MedianStore` by the blanket implementation following it.
+pub trait Median<KeyPrefix: FullCodec, MedianValue: Average + LexicographicEncoding>:
+  MedianStore<KeyPrefix, MedianValue>
+{
+  /// The current length of the median's list.
+  ///
+  /// Every instance of a value which is present multiple times is counted.
+  fn length(key_prefix: impl Copy + EncodeLike<KeyPrefix>) -> u64;
+
+  /// The current median value.
+  ///
+  /// This returns `None` if no values are present.
+  ///
+  /// Under `Policy::Average`, with an even length, this is the `Average::average` of the two
+  /// middle values.
+  fn median(key_prefix: impl Copy + EncodeLike<KeyPrefix>) -> Option<MedianValue>;
+
+  /// Push a new value onto the median.
+  ///
+  /// If the value is already present within the existing values, the amount of times it will be
+  /// considered present will be incremented.
+  fn push(key_prefix: impl Copy + EncodeLike<KeyPrefix>, value: MedianValue);
+
+  /// Pop a value from the median.
+  ///
+  /// This returns `true` if the value was present and `false` otherwise.
+  ///
+  /// If the value is present within the existing values multiple times, only a single instance
+  /// will be removed.
+  ///
+  /// When `false` is returned, no storage has been written to.
+  fn pop(key_prefix: impl Copy + EncodeLike<KeyPrefix>, value: MedianValue) -> bool;
+}
+
+// Implement `Median` for every `MedianStore`, solely using the storage items it declares.
+impl<
+    KeyPrefix: FullCodec,
+    MedianValue: Average + LexicographicEncoding,
+    S: MedianStore<KeyPrefix, MedianValue>,
+  > Median<KeyPrefix, MedianValue> for S
+{
+  // Reads `Length` as-is
+  fn length(key_prefix: impl Copy + EncodeLike<KeyPrefix>) -> u64 {
+    Self::Length::get(key_prefix)
+  }
+
+  /*
+    This function assumes `Position`, `Median` are up to date. This is guaranteed by
+    `update_median` being called after every single `push`, `pop` call, the only defined ways to
+    mutate the state.
+  */
+  fn median(key_prefix: impl Copy + EncodeLike<KeyPrefix>) -> Option<MedianValue> {
+    // No stored median means no values are present
+    let mut current_median = Self::Median::get(key_prefix)?;
+    // If we're supposed to take the average, do so now
+    if matches!(S::POLICY, Policy::Average) {
+      let length = Self::length(key_prefix);
+      if (length % 2) == 0 {
+        // This will yield the target position for the lesser value in the pair
+        let target_median_pos_lo = Self::POLICY.target_median_pos(length);
+        let target_median_pos_hi = target_median_pos_lo + 1;
+
+        /*
+          We need to take the average of the current value and the next value, due to
+          `Policy::Average` internally being considered `Policy::Lesser` and solely differing here
+          when the median is fetched.
+
+          To fetch the next value, we first need to identify if `target_median_pos` points to the
+          _last instance_ of the currently selected median value. If it does not, then the next
+          value is another instance of this value, the average of them themselves, and we can
+          return now.
+
+          If `target_median_pos` does point to the last instance of the currently selected median
+          value, then we fetch the next key in our trie to learn the next value in order to take the
+          average.
+        */
+        let current_median_pos =
+          Self::Position::get(key_prefix).expect("current median yet no position");
+        let current_median_encoding = current_median.lexicographic_encode();
+        let inclusions = Self::Store::get(key_prefix, &current_median_encoding);
+        // `Position` refers to the first instance, so the next value starts after all instances
+        let start_pos_of_next_value = current_median_pos + inclusions;
+
+        // Short-circuit if we are averaging two of the same value
+        if target_median_pos_hi < start_pos_of_next_value {
+          return Some(current_median);
+        }
+
+        let current_median_key = Self::Store::hashed_key_for(key_prefix, &current_median_encoding);
+        let (_key_prefix, next_encoding) = Self::Store::iter_keys_from(current_median_key)
+          .next()
+          .expect("last value in storage yet looking for value after it");
+        debug_assert_eq!(key_prefix.encode(), _key_prefix.encode(), "{KEY_PREFIX_ASSERT}");
+        debug_assert!(current_median_encoding != next_encoding, "{AFTER_ASSERT}");
+        let next_value = MedianValue::lexicographic_decode(next_encoding);
+
+        current_median = MedianValue::average(current_median, next_value);
+      }
+    }
+    Some(current_median)
+  }
+
+  fn push(key_prefix: impl Copy + EncodeLike<KeyPrefix>, value: MedianValue) {
+    // Update the length
+    let existing_length = Self::Length::get(key_prefix);
+    let new_length = existing_length + 1;
+    Self::Length::set(key_prefix, new_length);
+
+    // Update the amount of inclusions
+    let encoding = value.lexicographic_encode();
+    {
+      let existing_presences = Self::Store::get(key_prefix, &encoding);
+      let new_presences = existing_presences + 1;
+      Self::Store::set(key_prefix, &encoding, new_presences);
+      // The reverse store has a single entry per distinct value, written with its first instance
+      if existing_presences == 0 {
+        Self::ReverseStore::set(key_prefix, LexicographicReverse::from_encoding(encoding), ());
+      }
+    }
+
+    // If this was the first value inserted, initialize and immediately return
+    if existing_length == 0 {
+      Self::Position::set(key_prefix, Some(0));
+      Self::Median::set(key_prefix, Some(value));
+      return;
+    }
+
+    // Fetch the current median
+    let existing_median =
+      Self::Median::get(key_prefix).expect("values within median yet no median");
+
+    // If this value was inserted before the current median, the current median's position has
+    // increased
+    if value < existing_median {
+      let mut existing_median_pos =
+        Self::Position::get(key_prefix).expect("values within median yet no current position");
+      existing_median_pos += 1;
+      Self::Position::set(key_prefix, Some(existing_median_pos));
+    }
+
+    // Update the median
+    update_median::<_, _, Self>(key_prefix);
+  }
+
+  fn pop(key_prefix: impl Copy + EncodeLike<KeyPrefix>, value: MedianValue) -> bool {
+    let encoding = value.lexicographic_encode();
+    let mut inclusions = Self::Store::get(key_prefix, &encoding);
+    // A value which isn't present can't be popped, and nothing has been written at this point
+    if inclusions == 0 {
+      return false;
+    }
+
+    // Update the length
+    let existing_length = Self::Length::get(key_prefix);
+    let new_length = existing_length - 1;
+    Self::Length::set(key_prefix, new_length);
+
+    // Update the presence within the median's list
+    inclusions -= 1;
+    if inclusions == 0 {
+      Self::Store::remove(key_prefix, &encoding);
+      Self::ReverseStore::remove(key_prefix, LexicographicReverse::from_encoding(encoding));
+    } else {
+      Self::Store::set(key_prefix, encoding, inclusions);
+    }
+
+    let existing_median =
+      Self::Median::get(key_prefix).expect("values within median yet no median");
+    match value.cmp(&existing_median) {
+      // A value before the current median was removed, so the current median's position has
+      // decreased
+      Ordering::Less => {
+        let mut existing_median_pos =
+          Self::Position::get(key_prefix).expect("values within median yet no current position");
+        existing_median_pos -= 1;
+        Self::Position::set(key_prefix, Some(existing_median_pos));
+      }
+
+      Ordering::Equal if inclusions == 0 => {
+        /*
+          This value was the median, then removed, leaving `Median` and `Position` in an
+          ill-defined state. We attempt to consider `Position` as well-defined and solely update
+          `Median` to also be well-defined.
+
+          This works so long `Position` still refers to a valid position within the median's list.
+          It may not if the median's list started with length 1 or 2, where the current position
+          could have referred to the last element in the list, now popped.
+
+          If the length was 1, the list is now empty, triggering its own special case.
+
+          If the length was 2, we create a well-defined (and also accurate) definition for
+          `Position` and `Median` by setting them to the first (and only) item within
+          the list.
+        */
+        if new_length == 0 {
+          Self::Position::remove(key_prefix);
+          Self::Median::remove(key_prefix);
+        } else {
+          // This is only read. Any change to the position is written directly to `Position`
+          let existing_median_pos =
+            Self::Position::get(key_prefix).expect("values within median yet no current position");
+
+          let new_median_encoding = if existing_median_pos >= new_length {
+            /*
+              While resetting the declared median to the first item is always safe, so long as
+              `update_median` is called after (as done here), `update_median` has an algorithmic
+              complexity linear to the distance from the declared median to the correct median.
+              That means this can only be done, while maintaining the desired complexities, when a
+              bound is known on the distance from `0` to `target_median_pos`.
+
+              Since the list length is 1 in this case, per the reasoning above, the distance here
+              is `0`, making this a safe operation which also respects the desired complexities.
+            */
+            Self::Position::set(key_prefix, Some(0));
+            Self::Store::iter_key_prefix(key_prefix)
+              .next()
+              .expect("median list isn't empty yet has no values")
+          } else {
+            // The removed median's key is still a valid point to start iterating from, and the
+            // value after it now occupies the position the removed median held
+            let existing_median_key =
+              Self::Store::hashed_key_for(key_prefix, existing_median.lexicographic_encode());
+            let (_key_prefix, next_value_encoding) =
+              Self::Store::iter_keys_from(existing_median_key)
+                .next()
+                .expect("current median wasn't the last value yet no value was after");
+            debug_assert_eq!(key_prefix.encode(), _key_prefix.encode(), "{KEY_PREFIX_ASSERT}");
+            debug_assert!(
+              existing_median.lexicographic_encode() != next_value_encoding,
+              "{AFTER_ASSERT}",
+            );
+            next_value_encoding
+          };
+
+          Self::Median::set(
+            key_prefix,
+            Some(MedianValue::lexicographic_decode(new_median_encoding)),
+          );
+        }
+      }
+
+      /*
+        If this value is an instance of the current median, for which some remain, we consider this
+        as removing an instance other than the first instance which is what the position refers to.
+        Accordingly, we don't have to update the position.
+
+        If this is greater than the current median, then its removal does not effect the position
+        of the current median.
+      */
+      Ordering::Equal | Ordering::Greater => {}
+    }
+
+    // Update the median
+    update_median::<_, _, Self>(key_prefix);
+
+    true
+  }
+}
+
+// Randomized test comparing `Median`, under each `Policy`, against a sorted `Vec` acting as the
+// reference list.
+#[test]
+fn test_median() {
+  use frame_support::{
+    Blake2_128Concat, Identity,
+    storage::types::{self, ValueQuery, OptionQuery},
+  };
+
+  use rand_core::{RngCore, OsRng};
+
+  // Declare a unit struct implementing `StorageInstance` with the specified storage prefix, under
+  // the "median" pallet prefix
+  macro_rules! prefix {
+    ($name: ident, $prefix: expr) => {
+      struct $name;
+      impl frame_support::traits::StorageInstance for $name {
+        const STORAGE_PREFIX: &'static str = $prefix;
+        fn pallet_prefix() -> &'static str {
+          "median"
+        }
+      }
+    };
+  }
+  prefix!(PrefixLength, "Length");
+  prefix!(PrefixStore, "Store");
+  prefix!(PrefixReverse, "Reverse");
+  prefix!(PrefixPosition, "Position");
+  prefix!(PrefixMedian, "Median");
+
+  // The key prefix is `()` for every map, as this test only backs a single median at a time
+  type StorageMapStruct<Prefix, Value, Query> =
+    types::StorageMap<Prefix, Blake2_128Concat, (), Value, Query>;
+  // The second key uses the `Identity` hasher, as `MedianStore` requires
+  type StorageDoubleMapStruct<Prefix, Key, Value> =
+    types::StorageDoubleMap<Prefix, Blake2_128Concat, (), Identity, Key, Value, ValueQuery>;
+
+  // Declare a `MedianStore` over `u32` with the specified policy and run the randomized test
+  // against it, within its own `TestExternalities`
+  macro_rules! test {
+    ($name: ident, $policy: expr) => {
+      struct $name;
+      impl MedianStore<(), u32> for $name {
+        const POLICY: Policy = $policy;
+        type Length = StorageMapStruct<PrefixLength, u64, ValueQuery>;
+        type Store =
+          StorageDoubleMapStruct<PrefixStore, <u32 as LexicographicEncoding>::Encoding, u64>;
+        type ReverseStore = StorageDoubleMapStruct<PrefixReverse, LexicographicReverse<u32>, ()>;
+        type Position = StorageMapStruct<PrefixPosition, u64, OptionQuery>;
+        type Median = StorageMapStruct<PrefixMedian, u32, OptionQuery>;
+      }
+
+      sp_io::TestExternalities::default().execute_with(|| {
+        // The median starts empty
+        assert_eq!($name::length(()), 0);
+        assert_eq!($name::median(()), None);
+
+        // The reference list, kept sorted so the expected median can be taken by indexing
+        let mut current_list = vec![];
+        for i in 0 .. 1000 {
+          // Loop until an operation which is applicable to the current list is selected
+          'reselect: loop {
+            // This chooses a modulus low enough this `match` will in fact match, yet high enough
+            // more cases can be added without forgetting to update it being an issue
+            match OsRng.next_u64() % 8 {
+              // Push a freshly sampled value
+              0 => {
+                #[allow(clippy::cast_possible_truncation)]
+                let push = OsRng.next_u64() as u32;
+                current_list.push(push);
+                current_list.sort();
+                $name::push((), push);
+              }
+              // Push an existing value
+              1 if !current_list.is_empty() => {
+                let i =
+                  usize::try_from(OsRng.next_u64() % u64::try_from(current_list.len()).unwrap())
+                    .unwrap();
+                let push = current_list[i];
+                current_list.push(push);
+                current_list.sort();
+                $name::push((), push);
+              }
+              // Remove an existing value
+              2 if !current_list.is_empty() => {
+                let i =
+                  usize::try_from(OsRng.next_u64() % u64::try_from(current_list.len()).unwrap())
+                    .unwrap();
+                // `Vec::remove` preserves the order of the remaining items, so the list stays
+                // sorted
+                let pop = current_list.remove(i);
+                assert!($name::pop((), pop));
+              }
+              // Remove a value which is not present
+              3 => {
+                #[allow(clippy::cast_possible_truncation)]
+                let pop = OsRng.next_u64() as u32;
+                // Sample again if this happened to sample a value which is present
+                if current_list.contains(&pop) {
+                  continue 'reselect;
+                }
+                assert!(!$name::pop((), pop));
+              }
+              _ => continue 'reselect,
+            }
+            break 'reselect;
+          }
+
+          // After every operation, the length and median must match the reference list
+          assert_eq!(
+            $name::length(()),
+            u64::try_from(current_list.len()).unwrap(),
+            "length differs on iteration: {i}",
+          );
+          let target_median_pos =
+            $policy.target_median_pos(u64::try_from(current_list.len()).unwrap());
+          let target_median_pos = usize::try_from(target_median_pos).unwrap();
+          // `then` only evaluates the closure, and accordingly only indexes, for a non-empty list
+          let expected = (!current_list.is_empty()).then(|| match $policy {
+            Policy::Greater | Policy::Lesser => current_list[target_median_pos],
+            Policy::Average => {
+              // With an even length, the average of the two middle values is expected
+              if (current_list.len() % 2) == 0 {
+                u32::average(current_list[target_median_pos], current_list[target_median_pos + 1])
+              } else {
+                current_list[target_median_pos]
+              }
+            }
+          });
+          assert_eq!($name::median(()), expected, "median differs on iteration: {i}");
+        }
+      });
+    };
+  }
+
+  test!(Greater, Policy::Greater);
+  test!(Lesser, Policy::Lesser);
+  test!(Average, Policy::Average);
+}