From c580dae2cdc254d1e9e2e52e37d1c9f187261f20 Mon Sep 17 00:00:00 2001
From: Bogdan Opanchuk <bogdan@opanchuk.net>
Date: Tue, 25 Feb 2025 18:23:14 -0800
Subject: [PATCH 01/12] Fix a bug in `BoxedUnsatInt::to_uint()`

---
 src/modular/safegcd/boxed.rs | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/src/modular/safegcd/boxed.rs b/src/modular/safegcd/boxed.rs
index c2b3e4d6..78f8cf15 100644
--- a/src/modular/safegcd/boxed.rs
+++ b/src/modular/safegcd/boxed.rs
@@ -310,6 +310,9 @@ impl BoxedUnsatInt {
     /// Convert to a `BoxedUint` of the given precision.
     #[allow(trivial_numeric_casts)]
     fn to_uint(&self, mut bits_precision: u32) -> BoxedUint {
+        // If 32 bits were requested, shorten the (natively 64-bit) result after conversion.
+        let shorten = bits_precision == 32;
+
         // The current Bernstein-Yang implementation is natively 64-bit on all targets
         if bits_precision == 32 {
             bits_precision = 64;
@@ -334,7 +337,12 @@ impl BoxedUnsatInt {
             ret.as_words_mut()
         );
 
-        ret
+        if shorten {
+            debug_assert!(ret.bits_vartime() <= 32);
+            ret.shorten(32)
+        } else {
+            ret
+        }
     }
 
     /// Conditionally add the given value to this one depending on the given [`Choice`].

From 46013b902369f3ee081252ec0f28076e50992e9b Mon Sep 17 00:00:00 2001
From: Bogdan Opanchuk <bogdan@opanchuk.net>
Date: Wed, 5 Mar 2025 12:54:52 -0800
Subject: [PATCH 02/12] Add `Monty::div_by_2_assign()`

---
 benches/monty.rs                | 11 +++++++++
 src/modular/boxed_monty_form.rs | 10 ++++++++
 src/modular/div_by_2.rs         | 43 +++++++++++++++------------------
 src/traits.rs                   |  6 +++++
 src/uint/boxed/shr.rs           |  7 ------
 tests/boxed_monty_form.rs       | 22 +++++++++++++++++
 6 files changed, 68 insertions(+), 31 deletions(-)

diff --git a/benches/monty.rs b/benches/monty.rs
index d91375c8..37752da2 100644
--- a/benches/monty.rs
+++ b/benches/monty.rs
@@ -180,6 +180,17 @@ fn bench_montgomery_ops<M: Measurement>(group: &mut BenchmarkGroup<'_, M>) {
         )
     });
 
+    group.bench_function("div_by_2, U256", |b| {
+        b.iter_batched(
+            || {
+                let x = U256::random_mod(&mut rng, params.modulus().as_nz_ref());
+                MontyForm::new(&x, params)
+            },
+            |x| black_box(x.div_by_2()),
+            BatchSize::SmallInput,
+        )
+    });
+
     #[cfg(feature = "alloc")]
     for i in [1, 2, 3, 4, 10, 100] {
         group.bench_function(
diff --git a/src/modular/boxed_monty_form.rs b/src/modular/boxed_monty_form.rs
index fa68e517..8d3947b5 100644
--- a/src/modular/boxed_monty_form.rs
+++ b/src/modular/boxed_monty_form.rs
@@ -256,6 +256,12 @@ impl BoxedMontyForm {
             params: self.params.clone(),
         }
     }
+
+    /// Performs division by 2 in place, that is, finds `x` such that `x + x = self`
+    /// and writes it into `self`.
+    pub fn div_by_2_assign(&mut self) {
+        div_by_2::div_by_2_boxed_assign(&mut self.montgomery_form, &self.params.modulus)
+    }
 }
 
 impl Retrieve for BoxedMontyForm {
@@ -301,6 +307,10 @@ impl Monty for BoxedMontyForm {
         BoxedMontyForm::div_by_2(self)
     }
 
+    fn div_by_2_assign(&mut self) {
+        BoxedMontyForm::div_by_2_assign(self)
+    }
+
     fn lincomb_vartime(products: &[(&Self, &Self)]) -> Self {
         BoxedMontyForm::lincomb_vartime(products)
     }
diff --git a/src/modular/div_by_2.rs b/src/modular/div_by_2.rs
index c426c620..d8b2a758 100644
--- a/src/modular/div_by_2.rs
+++ b/src/modular/div_by_2.rs
@@ -1,6 +1,6 @@
 #[cfg(feature = "alloc")]
-use crate::{BoxedUint, ConstantTimeSelect};
-use crate::{Odd, Uint};
+use crate::{BoxedUint, Integer};
+use crate::{Limb, Odd, Uint};
 
 pub(crate) const fn div_by_2<const LIMBS: usize>(
     a: &Uint<LIMBS>,
@@ -10,38 +10,33 @@ pub(crate) const fn div_by_2<const LIMBS: usize>(
     // Two possibilities:
     // - if `a` is even, we can just divide by 2;
     // - if `a` is odd, we divide `(a + modulus)` by 2.
-    // To stay within the modulus we open the parentheses turning it into `a / 2 + modulus / 2 + 1`
-    // ("+1" because both `a` and `modulus` are odd, we lose 0.5 in each integer division).
-    // This will not overflow, so we can just use wrapping operations.
 
     // Note that this also works if `a` is a Montgomery representation modulo `modulus`
     // of some integer `x`.
     // If `b + b = a mod modulus` it means that `y + y = x mod modulus` where `y` is the integer
     // whose Montgomery representation is `b`.
 
-    let (half, is_odd) = a.shr1_with_carry();
-    let half_modulus = modulus.0.shr1();
-
-    let if_even = half;
-    let if_odd = half
-        .wrapping_add(&half_modulus)
-        .wrapping_add(&Uint::<LIMBS>::ONE);
-
-    Uint::<LIMBS>::select(&if_even, &if_odd, is_odd)
+    let is_odd = a.is_odd();
+    let (if_odd, carry) = a.adc(&modulus.0, Limb::ZERO);
+    let carry = Limb::select(Limb::ZERO, carry, is_odd);
+    Uint::<LIMBS>::select(a, &if_odd, is_odd)
+        .shr1()
+        .set_bit(Uint::<LIMBS>::BITS - 1, carry.is_nonzero())
 }
 
 #[cfg(feature = "alloc")]
 pub(crate) fn div_by_2_boxed(a: &BoxedUint, modulus: &Odd<BoxedUint>) -> BoxedUint {
-    debug_assert_eq!(a.bits_precision(), modulus.bits_precision());
-
-    let (mut half, is_odd) = a.shr1_with_carry();
-    let half_modulus = modulus.shr1();
-
-    let if_odd = half
-        .wrapping_add(&half_modulus)
-        .wrapping_add(&BoxedUint::one_with_precision(a.bits_precision()));
+    let mut result = a.clone();
+    div_by_2_boxed_assign(&mut result, modulus);
+    result
+}
 
-    half.ct_assign(&if_odd, is_odd);
+#[cfg(feature = "alloc")]
+pub(crate) fn div_by_2_boxed_assign(a: &mut BoxedUint, modulus: &Odd<BoxedUint>) {
+    debug_assert_eq!(a.bits_precision(), modulus.bits_precision());
 
-    half
+    let is_odd = a.is_odd();
+    let carry = a.conditional_adc_assign(modulus, is_odd);
+    a.shr1_assign();
+    a.set_bit(a.bits_precision() - 1, carry);
 }
diff --git a/src/traits.rs b/src/traits.rs
index 962b6f52..2c60c544 100644
--- a/src/traits.rs
+++ b/src/traits.rs
@@ -898,6 +898,12 @@ pub trait Monty:
     /// Performs division by 2, that is returns `x` such that `x + x = self`.
     fn div_by_2(&self) -> Self;
 
+    /// Performs division by 2 in place, that is, finds `x` such that `x + x = self`
+    /// and writes it into `self`.
+    fn div_by_2_assign(&mut self) {
+        *self = self.div_by_2()
+    }
+
     /// Calculate the sum of products of pairs `(a, b)` in `products`.
     ///
     /// This method is variable time only with the value of the modulus.
diff --git a/src/uint/boxed/shr.rs b/src/uint/boxed/shr.rs
index 84edd114..cf33a7f1 100644
--- a/src/uint/boxed/shr.rs
+++ b/src/uint/boxed/shr.rs
@@ -128,13 +128,6 @@ impl BoxedUint {
         success.map(|_| result)
     }
 
-    /// Computes `self >> 1` in constant-time, returning a true [`Choice`]
-    /// if the least significant bit was set, and a false [`Choice::FALSE`] otherwise.
-    pub(crate) fn shr1_with_carry(&self) -> (Self, Choice) {
-        let carry = self.limbs[0].0 & 1;
-        (self.shr1(), Choice::from(carry as u8))
-    }
-
     /// Computes `self >> 1` in constant-time.
     pub(crate) fn shr1(&self) -> Self {
         let mut ret = self.clone();
diff --git a/tests/boxed_monty_form.rs b/tests/boxed_monty_form.rs
index 2f4343ce..93eb0853 100644
--- a/tests/boxed_monty_form.rs
+++ b/tests/boxed_monty_form.rs
@@ -10,6 +10,7 @@ use crypto_bigint::{
     modular::{BoxedMontyForm, BoxedMontyParams},
 };
 use num_bigint::BigUint;
+use num_integer::Integer as _;
 use num_modular::ModularUnaryOps;
 use proptest::prelude::*;
 use std::cmp::Ordering;
@@ -153,4 +154,25 @@ proptest! {
 
         prop_assert_eq!(retrieve_biguint(&actual), expected);
     }
+
+    #[test]
+    fn div_by_2(a in monty_form()) {
+        let actual = a.div_by_2();
+        let mut actual_inplace = a.clone();
+        actual_inplace.div_by_2_assign();
+
+        let p = a.params().modulus();
+        let a_bi = retrieve_biguint(&a);
+        let p_bi = to_biguint(&p);
+
+        let expected = if a_bi.is_odd() {
+            (a_bi + p_bi) >> 1
+        }
+        else {
+            a_bi >> 1
+        };
+
+        prop_assert_eq!(&retrieve_biguint(&actual), &expected);
+        prop_assert_eq!(&retrieve_biguint(&actual_inplace), &expected);
+    }
 }

From dca8c0b06f4a0a44120e3aad2a97355c5c634da6 Mon Sep 17 00:00:00 2001
From: Bogdan Opanchuk <bogdan@opanchuk.net>
Date: Tue, 25 Feb 2025 11:28:03 -0800
Subject: [PATCH 03/12] More `inv_mod2k` methods and using them in
 `MontyParams` constructors

---
 src/modular/boxed_monty_form.rs | 33 +++++++++------
 src/modular/monty_form.rs       |  4 +-
 src/uint/boxed/inv_mod.rs       | 71 ++++++++++++++++++++++++++++++++-
 src/uint/boxed/sub.rs           |  5 +++
 src/uint/inv_mod.rs             | 40 +++++++++++++++++++
 tests/boxed_uint.rs             | 25 +++++++++++-
 6 files changed, 161 insertions(+), 17 deletions(-)

diff --git a/src/modular/boxed_monty_form.rs b/src/modular/boxed_monty_form.rs
index 8d3947b5..3693b20a 100644
--- a/src/modular/boxed_monty_form.rs
+++ b/src/modular/boxed_monty_form.rs
@@ -59,7 +59,24 @@ impl BoxedMontyParams {
             .rem(&modulus.as_nz_ref().widen(bits_precision * 2))
             .shorten(bits_precision);
 
-        Self::new_inner(modulus, one, r2)
+        // The modular inverse should always exist, because it was ensured odd above, which also ensures it's non-zero
+        let (inv_mod_limb, inv_mod_limb_exists) = modulus.inv_mod2k_vartime(Word::BITS);
+        debug_assert!(bool::from(inv_mod_limb_exists));
+
+        let mod_neg_inv = Limb(Word::MIN.wrapping_sub(inv_mod_limb.limbs[0].0));
+
+        let mod_leading_zeros = modulus.as_ref().leading_zeros().min(Word::BITS - 1);
+
+        let r3 = montgomery_reduction_boxed(&mut r2.square(), &modulus, mod_neg_inv);
+
+        Self {
+            modulus,
+            one,
+            r2,
+            r3,
+            mod_neg_inv,
+            mod_leading_zeros,
+        }
     }
 
     /// Instantiates a new set of [`BoxedMontyParams`] representing the given `modulus`, which
@@ -82,17 +99,9 @@ impl BoxedMontyParams {
             .rem_vartime(&modulus.as_nz_ref().widen(bits_precision * 2))
             .shorten(bits_precision);
 
-        Self::new_inner(modulus, one, r2)
-    }
-
-    /// Common functionality of `new` and `new_vartime`.
-    fn new_inner(modulus: Odd<BoxedUint>, one: BoxedUint, r2: BoxedUint) -> Self {
-        debug_assert_eq!(one.bits_precision(), modulus.bits_precision());
-        debug_assert_eq!(r2.bits_precision(), modulus.bits_precision());
-
-        // If the inverse exists, it means the modulus is odd.
-        let (inv_mod_limb, modulus_is_odd) = modulus.inv_mod2k(Word::BITS);
-        debug_assert!(bool::from(modulus_is_odd));
+        // The modular inverse should always exist, because it was ensured odd above, which also ensures it's non-zero
+        let (inv_mod_limb, inv_mod_limb_exists) = modulus.inv_mod2k_full_vartime(Word::BITS);
+        debug_assert!(bool::from(inv_mod_limb_exists));
 
         let mod_neg_inv = Limb(Word::MIN.wrapping_sub(inv_mod_limb.limbs[0].0));
 
diff --git a/src/modular/monty_form.rs b/src/modular/monty_form.rs
index 50b0eb12..964ea28e 100644
--- a/src/modular/monty_form.rs
+++ b/src/modular/monty_form.rs
@@ -55,7 +55,7 @@ where
 
         // The modular inverse should always exist, because it was ensured odd above, which also ensures it's non-zero
         let inv_mod = modulus
-            .inv_mod2k(Word::BITS)
+            .inv_mod2k_vartime(Word::BITS)
             .expect("modular inverse should exist");
 
         let mod_neg_inv = Limb(Word::MIN.wrapping_sub(inv_mod.limbs[0].0));
@@ -90,7 +90,7 @@ impl<const LIMBS: usize> MontyParams<LIMBS> {
 
         // The modular inverse should always exist, because it was ensured odd above, which also ensures it's non-zero
         let inv_mod = modulus
-            .inv_mod2k_vartime(Word::BITS)
+            .inv_mod2k_full_vartime(Word::BITS)
             .expect("modular inverse should exist");
 
         let mod_neg_inv = Limb(Word::MIN.wrapping_sub(inv_mod.limbs[0].0));
diff --git a/src/uint/boxed/inv_mod.rs b/src/uint/boxed/inv_mod.rs
index 06391a57..3f34d7d0 100644
--- a/src/uint/boxed/inv_mod.rs
+++ b/src/uint/boxed/inv_mod.rs
@@ -13,14 +13,78 @@ impl BoxedUint {
     }
 
     /// Computes 1/`self` mod `2^k`.
+    /// This method is variable-time with respect to both `self` and `k`.
     ///
     /// If the inverse does not exist (`k > 0` and `self` is even),
     /// returns `Choice::FALSE` as the second element of the tuple,
     /// otherwise returns `Choice::TRUE`.
-    pub(crate) fn inv_mod2k(&self, k: u32) -> (Self, Choice) {
+    pub(crate) fn inv_mod2k_full_vartime(&self, k: u32) -> (Self, Choice) {
         let mut x = Self::zero_with_precision(self.bits_precision()); // keeps `x` during iterations
         let mut b = Self::one_with_precision(self.bits_precision()); // keeps `b_i` during iterations
 
+        // The inverse exists either if `k` is 0 or if `self` is odd.
+        if k != 0 && !bool::from(self.is_odd()) {
+            return (x, Choice::from(0));
+        }
+
+        for i in 0..k {
+            // X_i = b_i mod 2
+            let x_i = b.limbs[0].0 & 1;
+            // b_{i+1} = (b_i - a * X_i) / 2
+            if x_i != 0 {
+                b.wrapping_sub_assign(self);
+            }
+            b.shr1_assign();
+            // Store the X_i bit in the result (x = x | (X_i << i))
+            x.set_bit_vartime(i, x_i != 0);
+        }
+
+        (x, Choice::from(1))
+    }
+
+    /// Computes 1/`self` mod `2^k`.
+    /// This method is constant-time w.r.t. `self` but not `k`.
+    ///
+    /// If the inverse does not exist (`k > 0` and `self` is even),
+    /// returns `Choice::FALSE` as the second element of the tuple,
+    /// otherwise returns `Choice::TRUE`.
+    pub fn inv_mod2k_vartime(&self, k: u32) -> (Self, Choice) {
+        let mut x = Self::zero_with_precision(self.bits_precision()); // keeps `x` during iterations
+        let mut b = Self::one_with_precision(self.bits_precision()); // keeps `b_i` during iterations
+        // Additional temporary storage we will need.
+        let mut b_opt = Self::zero_with_precision(self.bits_precision());
+
+        // The inverse exists either if `k` is 0 or if `self` is odd.
+        let is_some = k.ct_eq(&0) | self.is_odd();
+
+        for i in 0..k {
+            // X_i = b_i mod 2
+            let x_i = b.limbs[0].0 & 1;
+            let x_i_choice = Choice::from(x_i as u8);
+            // b_{i+1} = (b_i - a * X_i) / 2
+            b_opt.as_words_mut().copy_from_slice(b.as_words());
+            b_opt.wrapping_sub_assign(self);
+            b.ct_assign(&b_opt, x_i_choice);
+            b.shr1_assign();
+
+            // Store the X_i bit in the result (x = x | (X_i << i))
+            x.set_bit(i, x_i_choice);
+        }
+
+        (x, is_some)
+    }
+
+    /// Computes 1/`self` mod `2^k`.
+    ///
+    /// If the inverse does not exist (`k > 0` and `self` is even),
+    /// returns `Choice::FALSE` as the second element of the tuple,
+    /// otherwise returns `Choice::TRUE`.
+    pub fn inv_mod2k(&self, k: u32) -> (Self, Choice) {
+        let mut x = Self::zero_with_precision(self.bits_precision()); // keeps `x` during iterations
+        let mut b = Self::one_with_precision(self.bits_precision()); // keeps `b_i` during iterations
+        // Additional temporary storage we will need.
+        let mut b_opt = Self::zero_with_precision(self.bits_precision());
+
         // The inverse exists either if `k` is 0 or if `self` is odd.
         let is_some = k.ct_eq(&0) | self.is_odd();
 
@@ -33,7 +97,10 @@ impl BoxedUint {
             let x_i = b.limbs[0].0 & 1;
             let x_i_choice = Choice::from(x_i as u8);
             // b_{i+1} = (b_i - a * X_i) / 2
-            b = Self::ct_select(&b, &b.wrapping_sub(self), x_i_choice).shr1();
+            b_opt.as_words_mut().copy_from_slice(b.as_words());
+            b_opt.wrapping_sub_assign(self);
+            b.ct_assign(&b_opt, x_i_choice);
+            b.shr1_assign();
 
             // Store the X_i bit in the result (x = x | (1 << X_i))
             // Don't change the result in dummy iterations.
diff --git a/src/uint/boxed/sub.rs b/src/uint/boxed/sub.rs
index 411123c0..bd0c1d08 100644
--- a/src/uint/boxed/sub.rs
+++ b/src/uint/boxed/sub.rs
@@ -27,6 +27,11 @@ impl BoxedUint {
         borrow
     }
 
+    /// Perform wrapping subtraction in place, discarding overflow.
+    pub(crate) fn wrapping_sub_assign(&mut self, rhs: &Self) {
+        self.sbb_assign(rhs, Limb::ZERO);
+    }
+
     /// Perform wrapping subtraction, discarding overflow.
     pub fn wrapping_sub(&self, rhs: &Self) -> Self {
         self.sbb(rhs, Limb::ZERO).0
diff --git a/src/uint/inv_mod.rs b/src/uint/inv_mod.rs
index b818f411..235aa923 100644
--- a/src/uint/inv_mod.rs
+++ b/src/uint/inv_mod.rs
@@ -5,6 +5,46 @@ use crate::{
 use subtle::CtOption;
 
 impl<const LIMBS: usize> Uint<LIMBS> {
+    /// Computes 1/`self` mod `2^k`.
+    /// This method is variable-time with respect to both `self` and `k`.
+    ///
+    /// If the inverse does not exist (`k > 0` and `self` is even),
+    /// returns `None`; otherwise returns `Some` with the inverse
+    /// (zero when `k == 0`).
+    pub(crate) const fn inv_mod2k_full_vartime(&self, k: u32) -> Option<Self> {
+        // Using Algorithm 3 from "A Secure Algorithm for Inversion Modulo 2^k"
+        // by Sadiel de la Fe and Carles Ferrer.
+        // See <https://www.mdpi.com/2410-387X/2/3/23>.
+
+        // Note that we are not using Algorithm 4, since we have a different approach
+        // of enforcing constant-timeness w.r.t. `self`.
+
+        let mut x = Self::ZERO; // keeps `x` during iterations
+        let mut b = Self::ONE; // keeps `b_i` during iterations
+        let mut i = 0;
+
+        // The inverse exists either if `k` is 0 or if `self` is odd.
+        if k != 0 && !self.is_odd().to_bool_vartime() {
+            return None;
+        }
+
+        while i < k {
+            // X_i = b_i mod 2
+            let x_i = b.limbs[0].0 & 1;
+            // b_{i+1} = (b_i - a * X_i) / 2
+            if x_i != 0 {
+                b = b.wrapping_sub(self);
+            }
+            b = b.shr1();
+            // Store the X_i bit in the result (x = x | (X_i << i))
+            x = x.set_bit_vartime(i, x_i != 0);
+
+            i += 1;
+        }
+
+        Some(x)
+    }
+
     /// Computes 1/`self` mod `2^k`.
     /// This method is constant-time w.r.t. `self` but not `k`.
     ///
diff --git a/tests/boxed_uint.rs b/tests/boxed_uint.rs
index 652a81c6..ccacea40 100644
--- a/tests/boxed_uint.rs
+++ b/tests/boxed_uint.rs
@@ -6,12 +6,13 @@ mod common;
 
 use common::to_biguint;
 use core::cmp::Ordering;
-use crypto_bigint::{BoxedUint, CheckedAdd, Gcd, Integer, Limb, NonZero};
+use crypto_bigint::{BitOps, BoxedUint, CheckedAdd, Gcd, Integer, Limb, NonZero};
 use num_bigint::BigUint;
 use num_integer::Integer as _;
 use num_modular::ModularUnaryOps;
 use num_traits::identities::One;
 use proptest::prelude::*;
+use subtle::Choice;
 
 fn to_uint(big_uint: BigUint) -> BoxedUint {
     let bytes = big_uint.to_bytes_be();
@@ -155,6 +156,28 @@ proptest! {
         prop_assert_eq!(expected, actual);
     }
 
+    #[test]
+    fn inv_mod2k(mut a in uint(), k in any::<u32>()) {
+        a.set_bit(0, Choice::from(1)); // make odd
+        let k = k % (a.bits() + 1);
+        let a_bi = to_biguint(&a);
+        let m_bi = BigUint::one() << k as usize;
+
+        let actual = a.inv_mod2k(k).0;
+        let (actual_vartime, exists) = a.inv_mod2k_vartime(k);
+        prop_assert!(bool::from(exists));
+        prop_assert_eq!(&actual, &actual_vartime);
+
+        if k == 0 {
+            prop_assert_eq!(&actual, &BoxedUint::zero_with_precision(a.bits_precision()));
+        }
+        else {
+            let inv_bi = to_biguint(&actual);
+            let res = (inv_bi * a_bi) % m_bi;
+            prop_assert_eq!(res, BigUint::one());
+        }
+    }
+
     #[test]
     fn mod_inv((a, mut b) in uint_pair()) {
         if b.is_even().into() {

From c820291656fcf122c5b12266fa9d9fdb0e555b76 Mon Sep 17 00:00:00 2001
From: Bogdan Opanchuk <bogdan@opanchuk.net>
Date: Fri, 28 Feb 2025 14:50:26 -0800
Subject: [PATCH 04/12] Make MontyForm constructors const

---
 src/modular/monty_form.rs | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/modular/monty_form.rs b/src/modular/monty_form.rs
index 964ea28e..84910592 100644
--- a/src/modular/monty_form.rs
+++ b/src/modular/monty_form.rs
@@ -14,7 +14,7 @@ use super::{
     div_by_2::div_by_2,
     reduction::montgomery_reduction,
 };
-use crate::{Concat, Limb, Monty, NonZero, Odd, Split, Uint, Word};
+use crate::{Concat, ConstChoice, Limb, Monty, NonZero, Odd, Split, Uint, Word};
 use subtle::{Choice, ConditionallySelectable, ConstantTimeEq};
 
 /// Parameters to efficiently go to/from the Montgomery form for an odd modulus provided at runtime.
@@ -41,7 +41,7 @@ where
     Uint<WIDE_LIMBS>: Split<Output = Uint<LIMBS>>,
 {
     /// Instantiates a new set of `MontyParams` representing the given odd `modulus`.
-    pub fn new(modulus: Odd<Uint<LIMBS>>) -> Self {
+    pub const fn new(modulus: Odd<Uint<LIMBS>>) -> Self {
         // `R mod modulus` where `R = 2^BITS`.
         // Represents 1 in Montgomery form.
         let one = Uint::MAX.rem(modulus.as_nz_ref()).wrapping_add(&Uint::ONE);
@@ -55,12 +55,15 @@ where
 
         // The modular inverse should always exist, because it was ensured odd above, which also ensures it's non-zero
         let inv_mod = modulus
+            .as_ref()
             .inv_mod2k_vartime(Word::BITS)
             .expect("modular inverse should exist");
 
         let mod_neg_inv = Limb(Word::MIN.wrapping_sub(inv_mod.limbs[0].0));
 
-        let mod_leading_zeros = modulus.as_ref().leading_zeros().min(Word::BITS - 1);
+        let mod_leading_zeros = modulus.as_ref().leading_zeros();
+        let mod_leading_zeros = ConstChoice::from_u32_lt(mod_leading_zeros, Word::BITS - 1)
+            .select_u32(Word::BITS - 1, mod_leading_zeros);
 
         // `R^3 mod modulus`, used for inversion in Montgomery form.
         let r3 = montgomery_reduction(&r2.square_wide(), &modulus, mod_neg_inv);
@@ -78,7 +81,7 @@ where
 
 impl<const LIMBS: usize> MontyParams<LIMBS> {
     /// Instantiates a new set of `MontyParams` representing the given odd `modulus`.
-    pub fn new_vartime(modulus: Odd<Uint<LIMBS>>) -> Self {
+    pub const fn new_vartime(modulus: Odd<Uint<LIMBS>>) -> Self {
         // `R mod modulus` where `R = 2^BITS`.
         // Represents 1 in Montgomery form.
         let one = Uint::MAX
@@ -90,12 +93,18 @@ impl<const LIMBS: usize> MontyParams<LIMBS> {
 
         // The modular inverse should always exist, because it was ensured odd above, which also ensures it's non-zero
         let inv_mod = modulus
+            .as_ref()
             .inv_mod2k_full_vartime(Word::BITS)
             .expect("modular inverse should exist");
 
         let mod_neg_inv = Limb(Word::MIN.wrapping_sub(inv_mod.limbs[0].0));
 
-        let mod_leading_zeros = modulus.as_ref().leading_zeros_vartime().min(Word::BITS - 1);
+        let mod_leading_zeros = modulus.as_ref().leading_zeros_vartime();
+        let mod_leading_zeros = if mod_leading_zeros < Word::BITS - 1 {
+            mod_leading_zeros
+        } else {
+            Word::BITS - 1
+        };
 
         // `R^3 mod modulus`, used for inversion in Montgomery form.
         let r3 = montgomery_reduction(&r2.square_wide(), &modulus, mod_neg_inv);

From 0abfc4b0e04acf6de9955660f8a6b02934b300a1 Mon Sep 17 00:00:00 2001
From: Bogdan Opanchuk <bogdan@opanchuk.net>
Date: Tue, 25 Feb 2025 12:09:04 -0800
Subject: [PATCH 05/12] Use `sub_assign_mod_with_carry()` in
 `montgomery_reduction_boxed_mut()`

---
 src/modular/reduction.rs | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/src/modular/reduction.rs b/src/modular/reduction.rs
index d6d15b55..fc62a317 100644
--- a/src/modular/reduction.rs
+++ b/src/modular/reduction.rs
@@ -3,7 +3,7 @@
 use crate::{Limb, Odd, Uint};
 
 #[cfg(feature = "alloc")]
-use {crate::BoxedUint, subtle::Choice};
+use crate::BoxedUint;
 
 /// Algorithm 14.32 in Handbook of Applied Cryptography <https://cacr.uwaterloo.ca/hac/about/chap14.pdf>
 #[inline(always)]
@@ -84,15 +84,11 @@ pub(crate) fn montgomery_reduction_boxed_mut(
     let (lower, upper) = x.limbs.split_at_mut(modulus.nlimbs());
     let meta_carry = montgomery_reduction_inner(upper, lower, &modulus.limbs, mod_neg_inv);
 
+    // Division is simply taking the upper half of the limbs.
+    // Final reduction (at this point, the value is at most 2 * modulus,
+    // so `meta_carry` is either 0 or 1).
     out.limbs.copy_from_slice(upper);
-    let borrow = out.sbb_assign(modulus, Limb::ZERO);
-
-    // The new `borrow = Word::MAX` iff `carry == 0` and `borrow == Word::MAX`.
-    let borrow = Limb((!meta_carry.0.wrapping_neg()) & borrow.0);
-
-    // If underflow occurred on the final limb, borrow = 0xfff...fff, otherwise
-    // borrow = 0x000...000. Thus, we use it as a mask to conditionally add the modulus.
-    out.conditional_adc_assign(modulus, Choice::from((borrow.0 & 1) as u8));
+    out.sub_assign_mod_with_carry(meta_carry, modulus, modulus);
 }
 
 /// Algorithm 14.32 in Handbook of Applied Cryptography <https://cacr.uwaterloo.ca/hac/about/chap14.pdf>

From 0cc98266e497b44d52e3bb8c5784b3fd3838bc09 Mon Sep 17 00:00:00 2001
From: Bogdan Opanchuk <bogdan@opanchuk.net>
Date: Tue, 25 Feb 2025 12:09:19 -0800
Subject: [PATCH 06/12] Get rid of allocations in Add/SubAssign for
 BoxedMontyForm

---
 src/modular/boxed_monty_form/add.rs |  5 ++---
 src/modular/boxed_monty_form/sub.rs |  9 ++++++---
 src/uint/boxed/add_mod.rs           | 18 +++++++++++++-----
 3 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/src/modular/boxed_monty_form/add.rs b/src/modular/boxed_monty_form/add.rs
index 85737195..42190609 100644
--- a/src/modular/boxed_monty_form/add.rs
+++ b/src/modular/boxed_monty_form/add.rs
@@ -58,9 +58,8 @@ impl Add<BoxedMontyForm> for BoxedMontyForm {
 impl AddAssign<&BoxedMontyForm> for BoxedMontyForm {
     fn add_assign(&mut self, rhs: &BoxedMontyForm) {
         debug_assert_eq!(self.params, rhs.params);
-        self.montgomery_form = self
-            .montgomery_form
-            .add_mod(&rhs.montgomery_form, &self.params.modulus)
+        self.montgomery_form
+            .add_mod_assign(&rhs.montgomery_form, &self.params.modulus);
     }
 }
 
diff --git a/src/modular/boxed_monty_form/sub.rs b/src/modular/boxed_monty_form/sub.rs
index dc777d0c..3524da91 100644
--- a/src/modular/boxed_monty_form/sub.rs
+++ b/src/modular/boxed_monty_form/sub.rs
@@ -1,6 +1,7 @@
 //! Subtractions between boxed integers in Montgomery form.
 
 use super::BoxedMontyForm;
+use crate::Limb;
 use core::ops::{Sub, SubAssign};
 
 impl BoxedMontyForm {
@@ -51,9 +52,11 @@ impl Sub<BoxedMontyForm> for BoxedMontyForm {
 impl SubAssign<&BoxedMontyForm> for BoxedMontyForm {
     fn sub_assign(&mut self, rhs: &BoxedMontyForm) {
         debug_assert_eq!(self.params, rhs.params);
-        self.montgomery_form = self
-            .montgomery_form
-            .sub_mod(&rhs.montgomery_form, &self.params.modulus)
+        self.montgomery_form.sub_assign_mod_with_carry(
+            Limb::ZERO,
+            &rhs.montgomery_form,
+            &self.params.modulus,
+        );
     }
 }
 
diff --git a/src/uint/boxed/add_mod.rs b/src/uint/boxed/add_mod.rs
index 12bade5b..5b0054ce 100644
--- a/src/uint/boxed/add_mod.rs
+++ b/src/uint/boxed/add_mod.rs
@@ -7,22 +7,30 @@ impl BoxedUint {
     ///
     /// Assumes `self + rhs` as unbounded integer is `< 2p`.
     pub fn add_mod(&self, rhs: &Self, p: &Self) -> Self {
+        let mut result = self.clone();
+        result.add_mod_assign(rhs, p);
+        result
+    }
+
+    /// Computes `self + rhs mod p` and writes the result in `self`.
+    ///
+    /// Assumes `self + rhs` as unbounded integer is `< 2p`.
+    pub fn add_mod_assign(&mut self, rhs: &Self, p: &Self) {
         debug_assert_eq!(self.bits_precision(), p.bits_precision());
         debug_assert_eq!(rhs.bits_precision(), p.bits_precision());
-        debug_assert!(self < p);
+        debug_assert!(&*self < p);
         debug_assert!(rhs < p);
 
-        let (mut w, carry) = self.adc(rhs, Limb::ZERO);
+        let carry = self.adc_assign(rhs, Limb::ZERO);
 
         // Attempt to subtract the modulus, to ensure the result is in the field.
-        let borrow = w.sbb_assign(p, Limb::ZERO);
+        let borrow = self.sbb_assign(p, Limb::ZERO);
         let (_, borrow) = carry.sbb(Limb::ZERO, borrow);
 
         // If underflow occurred on the final limb, borrow = 0xfff...fff, otherwise
         // borrow = 0x000...000. Thus, we use it as a mask to conditionally add the
         // modulus.
-        w.conditional_adc_assign(p, !borrow.is_zero());
-        w
+        self.conditional_adc_assign(p, !borrow.is_zero());
     }
 
     /// Computes `self + self mod p`.

From 354efe6727a06d2d17c3fbe1f87f9ff145870026 Mon Sep 17 00:00:00 2001
From: Bogdan Opanchuk <bogdan@opanchuk.net>
Date: Tue, 25 Feb 2025 12:59:54 -0800
Subject: [PATCH 07/12] Use AMM for BoxedMontyForm multiplication

---
 src/modular/boxed_monty_form/mul.rs | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/src/modular/boxed_monty_form/mul.rs b/src/modular/boxed_monty_form/mul.rs
index d8cc1676..961f5f8c 100644
--- a/src/modular/boxed_monty_form/mul.rs
+++ b/src/modular/boxed_monty_form/mul.rs
@@ -6,10 +6,7 @@
 //! Originally (c) 2014 The Rust Project Developers, dual licensed Apache 2.0+MIT.
 
 use super::{BoxedMontyForm, BoxedMontyParams};
-use crate::{
-    BoxedUint, Limb, Square, SquareAssign, Word, Zero,
-    modular::reduction::montgomery_reduction_boxed_mut, uint::mul::mul_limbs,
-};
+use crate::{BoxedUint, Limb, Square, SquareAssign, Word, Zero};
 use core::{
     borrow::Borrow,
     ops::{Mul, MulAssign},
@@ -132,11 +129,8 @@ impl<'a> MontyMultiplier<'a> {
 
     /// Perform a Montgomery multiplication, assigning a fully reduced result to `a`.
     pub(super) fn mul_assign(&mut self, a: &mut BoxedUint, b: &BoxedUint) {
-        debug_assert_eq!(a.bits_precision(), self.modulus.bits_precision());
-        debug_assert_eq!(b.bits_precision(), self.modulus.bits_precision());
-
-        mul_limbs(&a.limbs, &b.limbs, &mut self.product.limbs);
-        montgomery_reduction_boxed_mut(&mut self.product, self.modulus, self.mod_neg_inv, a);
+        self.mul_amm_assign(a, b);
+        a.sub_assign_mod_with_carry(Limb::ZERO, self.modulus, self.modulus);
 
         debug_assert!(&*a < self.modulus);
     }
@@ -150,11 +144,8 @@ impl<'a> MontyMultiplier<'a> {
 
     /// Perform a squaring using Montgomery multiplication, assigning a fully reduced result to `a`.
     pub(super) fn square_assign(&mut self, a: &mut BoxedUint) {
-        debug_assert_eq!(a.bits_precision(), self.modulus.bits_precision());
-
-        // TODO(tarcieri): optimized implementation
-        mul_limbs(&a.limbs, &a.limbs, &mut self.product.limbs);
-        montgomery_reduction_boxed_mut(&mut self.product, self.modulus, self.mod_neg_inv, a);
+        self.square_amm_assign(a);
+        a.sub_assign_mod_with_carry(Limb::ZERO, self.modulus, self.modulus);
 
         debug_assert!(&*a < self.modulus);
     }
@@ -211,6 +202,7 @@ impl<'a> MontyMultiplier<'a> {
     pub(super) fn square_amm_assign(&mut self, a: &mut BoxedUint) {
         debug_assert_eq!(a.bits_precision(), self.modulus.bits_precision());
 
+        // TODO(tarcieri): optimized implementation
         self.clear_product();
         almost_montgomery_mul(
             self.product.as_limbs_mut(),

From c1dd82363dfa1716bbd04be486e1071afd49ef56 Mon Sep 17 00:00:00 2001
From: Bogdan Opanchuk <bogdan@opanchuk.net>
Date: Tue, 25 Feb 2025 12:27:05 -0800
Subject: [PATCH 08/12] Use AMM for conversion to Montgomery and in
 `BoxedMontyParams` constructor

---
 src/modular/boxed_monty_form.rs | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/src/modular/boxed_monty_form.rs b/src/modular/boxed_monty_form.rs
index 3693b20a..7429ca5d 100644
--- a/src/modular/boxed_monty_form.rs
+++ b/src/modular/boxed_monty_form.rs
@@ -8,10 +8,9 @@ mod neg;
 mod pow;
 mod sub;
 
-use super::{
-    ConstMontyParams, Retrieve, div_by_2,
-    reduction::{montgomery_reduction_boxed, montgomery_reduction_boxed_mut},
-};
+use super::{ConstMontyParams, Retrieve, div_by_2, reduction::montgomery_reduction_boxed};
+use mul::MontyMultiplier;
+
 use crate::{BoxedUint, Limb, Monty, Odd, Word};
 use alloc::sync::Arc;
 use subtle::Choice;
@@ -67,7 +66,10 @@ impl BoxedMontyParams {
 
         let mod_leading_zeros = modulus.as_ref().leading_zeros().min(Word::BITS - 1);
 
-        let r3 = montgomery_reduction_boxed(&mut r2.square(), &modulus, mod_neg_inv);
+        let r3 = {
+            let mut mm = MontyMultiplier::new(&modulus, mod_neg_inv);
+            mm.square(&r2)
+        };
 
         Self {
             modulus,
@@ -107,7 +109,10 @@ impl BoxedMontyParams {
 
         let mod_leading_zeros = modulus.as_ref().leading_zeros().min(Word::BITS - 1);
 
-        let r3 = montgomery_reduction_boxed(&mut r2.square(), &modulus, mod_neg_inv);
+        let r3 = {
+            let mut mm = MontyMultiplier::new(&modulus, mod_neg_inv);
+            mm.square(&r2)
+        };
 
         Self {
             modulus,
@@ -336,11 +341,8 @@ impl Zeroize for BoxedMontyForm {
 /// Convert the given integer into the Montgomery domain.
 #[inline]
 fn convert_to_montgomery(integer: &mut BoxedUint, params: &BoxedMontyParams) {
-    let mut product = integer.mul(&params.r2);
-    montgomery_reduction_boxed_mut(&mut product, &params.modulus, params.mod_neg_inv, integer);
-
-    #[cfg(feature = "zeroize")]
-    product.zeroize();
+    let mut mm = MontyMultiplier::from(params);
+    mm.mul_assign(integer, &params.r2);
 }
 
 #[cfg(test)]

From d62e139e1d263df9ce312e846642509020ab2763 Mon Sep 17 00:00:00 2001
From: Bogdan Opanchuk <bogdan@opanchuk.net>
Date: Wed, 5 Mar 2025 10:33:24 -0800
Subject: [PATCH 09/12] Use AMM in BoxedMontyForm::retrieve

---
 src/modular/boxed_monty_form.rs     | 17 ++-----
 src/modular/boxed_monty_form/mul.rs | 69 ++++++++++++++++++++++++++++-
 src/modular/reduction.rs            | 41 -----------------
 3 files changed, 71 insertions(+), 56 deletions(-)

diff --git a/src/modular/boxed_monty_form.rs b/src/modular/boxed_monty_form.rs
index 7429ca5d..5a414c6c 100644
--- a/src/modular/boxed_monty_form.rs
+++ b/src/modular/boxed_monty_form.rs
@@ -8,7 +8,7 @@ mod neg;
 mod pow;
 mod sub;
 
-use super::{ConstMontyParams, Retrieve, div_by_2, reduction::montgomery_reduction_boxed};
+use super::{ConstMontyParams, Retrieve, div_by_2};
 use mul::MontyMultiplier;
 
 use crate::{BoxedUint, Limb, Monty, Odd, Word};
@@ -187,19 +187,8 @@ impl BoxedMontyForm {
 
     /// Retrieves the integer currently encoded in this [`BoxedMontyForm`], guaranteed to be reduced.
     pub fn retrieve(&self) -> BoxedUint {
-        let mut montgomery_form = self.montgomery_form.widen(self.bits_precision() * 2);
-
-        let ret = montgomery_reduction_boxed(
-            &mut montgomery_form,
-            &self.params.modulus,
-            self.params.mod_neg_inv,
-        );
-
-        #[cfg(feature = "zeroize")]
-        montgomery_form.zeroize();
-
-        debug_assert!(ret < self.params.modulus);
-        ret
+        let mut mm = MontyMultiplier::from(self.params.as_ref());
+        mm.mul_by_one(&self.montgomery_form)
     }
 
     /// Instantiates a new `ConstMontyForm` that represents zero.
diff --git a/src/modular/boxed_monty_form/mul.rs b/src/modular/boxed_monty_form/mul.rs
index 961f5f8c..cd111498 100644
--- a/src/modular/boxed_monty_form/mul.rs
+++ b/src/modular/boxed_monty_form/mul.rs
@@ -6,7 +6,7 @@
 //! Originally (c) 2014 The Rust Project Developers, dual licensed Apache 2.0+MIT.
 
 use super::{BoxedMontyForm, BoxedMontyParams};
-use crate::{BoxedUint, Limb, Square, SquareAssign, Word, Zero};
+use crate::{BoxedUint, ConstChoice, Limb, Square, SquareAssign, Word, Zero};
 use core::{
     borrow::Borrow,
     ops::{Mul, MulAssign},
@@ -135,6 +135,26 @@ impl<'a> MontyMultiplier<'a> {
         debug_assert!(&*a < self.modulus);
     }
 
+    /// Perform a Montgomery multiplication of `a` by one, returning a fully reduced result.
+    pub(super) fn mul_by_one(&mut self, a: &BoxedUint) -> BoxedUint {
+        debug_assert_eq!(a.bits_precision(), self.modulus.bits_precision());
+
+        let mut ret = a.clone();
+
+        self.clear_product();
+        almost_montgomery_mul_by_one(
+            self.product.as_limbs_mut(),
+            a.as_limbs(),
+            self.modulus.as_limbs(),
+            self.mod_neg_inv,
+        );
+        ret.limbs
+            .copy_from_slice(&self.product.limbs[..a.limbs.len()]);
+        ret.sub_assign_mod_with_carry(Limb::ZERO, self.modulus, self.modulus);
+
+        ret
+    }
+
     /// Perform a squaring using Montgomery multiplication, returning a fully reduced result.
     pub(super) fn square(&mut self, a: &BoxedUint) -> BoxedUint {
         let mut ret = a.clone();
@@ -279,6 +299,53 @@ fn almost_montgomery_mul(z: &mut [Limb], x: &[Limb], y: &[Limb], m: &[Limb], k:
     }
 }
 
+/// Same as `almost_montgomery_mul` with `y == 1`.
+///
+/// Used for retrieving from Montgomery form.
+fn almost_montgomery_mul_by_one(z: &mut [Limb], x: &[Limb], m: &[Limb], k: Limb) {
+    // This code assumes x, m are all the same length (required by addMulVVW and the for loop).
+    // It also assumes that x is already reduced mod m, or else the result will not be properly
+    // reduced.
+    let n = m.len();
+
+    // This preconditions check allows compiler to remove bound checks later in the code.
+    // `z.len() > n && z[n..].len() == n` is used intentionally instead of `z.len() == 2* n`
+    // since the latter prevents compiler from removing some bound checks.
+    let pre_cond = z.len() > n && z[n..].len() == n && x.len() == n && m.len() == n;
+    if !pre_cond {
+        panic!("Failed preconditions in montgomery_mul");
+    }
+
+    let mut c = ConstChoice::FALSE;
+
+    // The unrolled first iteration.
+    let c2 = add_mul_vvw(&mut z[0..n], x, Limb::ONE);
+    let t = z[0].wrapping_mul(k);
+    let c3 = add_mul_vvw(&mut z[0..n], m, t);
+    let cx = c2.wrapping_add(Limb(c.to_u8() as Word));
+    let cy = cx.wrapping_add(c3);
+    z[n] = cy;
+    c = ConstChoice::from_word_lt(cx.0, c2.0).or(ConstChoice::from_word_lt(cy.0, c3.0));
+
+    for i in 1..n {
+        let c2 = add_mul_vvw(&mut z[i..n + i], x, Limb::ZERO);
+        let t = z[i].wrapping_mul(k);
+        let c3 = add_mul_vvw(&mut z[i..n + i], m, t);
+        let cx = c2.wrapping_add(Limb(c.to_u8() as Word));
+        let cy = cx.wrapping_add(c3);
+        z[n + i] = cy;
+        c = ConstChoice::from_word_lt(cx.0, c2.0).or(ConstChoice::from_word_lt(cy.0, c3.0));
+    }
+
+    let (lower, upper) = z.split_at_mut(n);
+    sub_vv(lower, upper, m);
+
+    let is_zero = c.not();
+    for (a, b) in lower.iter_mut().zip(upper.iter()) {
+        *a = Limb::select(*a, *b, is_zero);
+    }
+}
+
 #[inline]
 fn add_mul_vvw(z: &mut [Limb], x: &[Limb], y: Limb) -> Limb {
     let mut c = Limb::ZERO;
diff --git a/src/modular/reduction.rs b/src/modular/reduction.rs
index fc62a317..18f9b52b 100644
--- a/src/modular/reduction.rs
+++ b/src/modular/reduction.rs
@@ -2,9 +2,6 @@
 
 use crate::{Limb, Odd, Uint};
 
-#[cfg(feature = "alloc")]
-use crate::BoxedUint;
-
 /// Algorithm 14.32 in Handbook of Applied Cryptography <https://cacr.uwaterloo.ca/hac/about/chap14.pdf>
 #[inline(always)]
 const fn montgomery_reduction_inner(
@@ -67,41 +64,3 @@ pub const fn montgomery_reduction<const LIMBS: usize>(
     // so `meta_carry` is either 0 or 1)
     upper.sub_mod_with_carry(meta_carry, &modulus.0, &modulus.0)
 }
-
-/// Algorithm 14.32 in Handbook of Applied Cryptography <https://cacr.uwaterloo.ca/hac/about/chap14.pdf>
-///
-/// This version writes the result into the provided [`BoxedUint`].
-#[cfg(feature = "alloc")]
-pub(crate) fn montgomery_reduction_boxed_mut(
-    x: &mut BoxedUint,
-    modulus: &BoxedUint,
-    mod_neg_inv: Limb,
-    out: &mut BoxedUint,
-) {
-    debug_assert_eq!(x.nlimbs(), modulus.nlimbs() * 2);
-    debug_assert_eq!(out.nlimbs(), modulus.nlimbs());
-
-    let (lower, upper) = x.limbs.split_at_mut(modulus.nlimbs());
-    let meta_carry = montgomery_reduction_inner(upper, lower, &modulus.limbs, mod_neg_inv);
-
-    // Division is simply taking the upper half of the limbs
-    // Final reduction (at this point, the value is at most 2 * modulus,
-    // so `meta_carry` is either 0 or 1)
-    out.limbs.copy_from_slice(upper);
-    out.sub_assign_mod_with_carry(meta_carry, modulus, modulus);
-}
-
-/// Algorithm 14.32 in Handbook of Applied Cryptography <https://cacr.uwaterloo.ca/hac/about/chap14.pdf>
-///
-/// This version allocates and returns a [`BoxedUint`].
-#[cfg(feature = "alloc")]
-#[inline]
-pub(crate) fn montgomery_reduction_boxed(
-    x: &mut BoxedUint,
-    modulus: &BoxedUint,
-    mod_neg_inv: Limb,
-) -> BoxedUint {
-    let mut ret = BoxedUint::zero_with_precision(modulus.bits_precision());
-    montgomery_reduction_boxed_mut(x, modulus, mod_neg_inv, &mut ret);
-    ret
-}

From 6529b75ec5f11e7ba7d790239d7c7b298bb714b2 Mon Sep 17 00:00:00 2001
From: Bogdan Opanchuk <bogdan@opanchuk.net>
Date: Wed, 5 Mar 2025 10:28:44 -0800
Subject: [PATCH 10/12] Make AMM functions const

---
 src/modular/boxed_monty_form/mul.rs | 91 ++++++++++++++++++-----------
 1 file changed, 56 insertions(+), 35 deletions(-)

diff --git a/src/modular/boxed_monty_form/mul.rs b/src/modular/boxed_monty_form/mul.rs
index cd111498..2680d83b 100644
--- a/src/modular/boxed_monty_form/mul.rs
+++ b/src/modular/boxed_monty_form/mul.rs
@@ -6,12 +6,11 @@
 //! Originally (c) 2014 The Rust Project Developers, dual licensed Apache 2.0+MIT.
 
 use super::{BoxedMontyForm, BoxedMontyParams};
-use crate::{BoxedUint, ConstChoice, Limb, Square, SquareAssign, Word, Zero};
+use crate::{BoxedUint, ConstChoice, Limb, Square, SquareAssign, Word};
 use core::{
     borrow::Borrow,
     ops::{Mul, MulAssign},
 };
-use subtle::{ConditionallySelectable, ConstantTimeLess};
 
 #[cfg(feature = "zeroize")]
 use zeroize::Zeroize;
@@ -264,54 +263,55 @@ impl Drop for MontyMultiplier<'_> {
 ///
 /// Note: this was adapted from an implementation in `num-bigint`'s `monty.rs`.
 // TODO(tarcieri): refactor into `reduction.rs`, share impl with `MontyForm`?
-fn almost_montgomery_mul(z: &mut [Limb], x: &[Limb], y: &[Limb], m: &[Limb], k: Limb) {
+const fn almost_montgomery_mul(z: &mut [Limb], x: &[Limb], y: &[Limb], m: &[Limb], k: Limb) {
     // This code assumes x, y, m are all the same length (required by addMulVVW and the for loop).
     // It also assumes that x, y are already reduced mod m, or else the result will not be properly
     // reduced.
     let n = m.len();
 
     // This preconditions check allows compiler to remove bound checks later in the code.
-    // `z.len() > n && z[n..].len() == n` is used intentionally instead of `z.len() == 2* n`
-    // since the latter prevents compiler from removing some bound checks.
-    let pre_cond = z.len() > n && z[n..].len() == n && x.len() == n && y.len() == n && m.len() == n;
+    let pre_cond = z.len() > n && x.len() == n && y.len() == n && m.len() == n;
     if !pre_cond {
         panic!("Failed preconditions in montgomery_mul");
     }
 
-    let mut c = Limb::ZERO;
+    let mut c = ConstChoice::FALSE;
 
-    for i in 0..n {
-        let c2 = add_mul_vvw(&mut z[i..n + i], x, y[i]);
-        let t = z[i].wrapping_mul(k);
-        let c3 = add_mul_vvw(&mut z[i..n + i], m, t);
-        let cx = c.wrapping_add(c2);
+    let mut i = 0;
+    while i < n {
+        let (_, z_slice) = z.split_at_mut(i);
+        let c2 = add_mul_vvw(z_slice, x, y[i]);
+        let t = z_slice[0].wrapping_mul(k);
+        let c3 = add_mul_vvw(z_slice, m, t);
+        let cx = c2.wrapping_add(Limb(c.to_u8() as Word));
         let cy = cx.wrapping_add(c3);
         z[n + i] = cy;
-        c = Limb((cx.ct_lt(&c2) | cy.ct_lt(&c3)).unwrap_u8() as Word);
+        c = ConstChoice::from_word_lt(cx.0, c2.0).or(ConstChoice::from_word_lt(cy.0, c3.0));
+        i += 1;
     }
 
     let (lower, upper) = z.split_at_mut(n);
     sub_vv(lower, upper, m);
 
-    let is_zero = c.is_zero();
-    for (a, b) in lower.iter_mut().zip(upper.iter()) {
-        a.conditional_assign(b, is_zero);
+    let is_zero = c.not();
+    let mut i = 0;
+    while i < n {
+        lower[i] = Limb::select(lower[i], upper[i], is_zero);
+        i += 1;
     }
 }
 
 /// Same as `almost_montgomery_mul` with `y == 1`.
 ///
 /// Used for retrieving from Montgomery form.
-fn almost_montgomery_mul_by_one(z: &mut [Limb], x: &[Limb], m: &[Limb], k: Limb) {
+const fn almost_montgomery_mul_by_one(z: &mut [Limb], x: &[Limb], m: &[Limb], k: Limb) {
     // This code assumes x, m are all the same length (required by addMulVVW and the for loop).
     // It also assumes that x is already reduced mod m, or else the result will not be properly
     // reduced.
     let n = m.len();
 
     // This preconditions check allows compiler to remove bound checks later in the code.
-    // `z.len() > n && z[n..].len() == n` is used intentionally instead of `z.len() == 2* n`
-    // since the latter prevents compiler from removing some bound checks.
-    let pre_cond = z.len() > n && z[n..].len() == n && x.len() == n && m.len() == n;
+    let pre_cond = z.len() > n && x.len() == n && m.len() == n;
     if !pre_cond {
         panic!("Failed preconditions in montgomery_mul");
     }
@@ -319,53 +319,74 @@ fn almost_montgomery_mul_by_one(z: &mut [Limb], x: &[Limb], m: &[Limb], k: Limb)
     let mut c = ConstChoice::FALSE;
 
     // The unrolled first iteration.
-    let c2 = add_mul_vvw(&mut z[0..n], x, Limb::ONE);
+    let c2 = add_mul_vvw(z, x, Limb::ONE);
     let t = z[0].wrapping_mul(k);
-    let c3 = add_mul_vvw(&mut z[0..n], m, t);
+    let c3 = add_mul_vvw(z, m, t);
     let cx = c2.wrapping_add(Limb(c.to_u8() as Word));
     let cy = cx.wrapping_add(c3);
     z[n] = cy;
     c = ConstChoice::from_word_lt(cx.0, c2.0).or(ConstChoice::from_word_lt(cy.0, c3.0));
 
-    for i in 1..n {
-        let c2 = add_mul_vvw(&mut z[i..n + i], x, Limb::ZERO);
-        let t = z[i].wrapping_mul(k);
-        let c3 = add_mul_vvw(&mut z[i..n + i], m, t);
+    let mut i = 1;
+    while i < n {
+        let (_, z_slice) = z.split_at_mut(i);
+        let c2 = add_mul_vvw(z_slice, x, Limb::ZERO);
+        let t = z_slice[0].wrapping_mul(k);
+        let c3 = add_mul_vvw(z_slice, m, t);
         let cx = c2.wrapping_add(Limb(c.to_u8() as Word));
         let cy = cx.wrapping_add(c3);
         z[n + i] = cy;
         c = ConstChoice::from_word_lt(cx.0, c2.0).or(ConstChoice::from_word_lt(cy.0, c3.0));
+        i += 1;
     }
 
     let (lower, upper) = z.split_at_mut(n);
     sub_vv(lower, upper, m);
 
     let is_zero = c.not();
-    for (a, b) in lower.iter_mut().zip(upper.iter()) {
-        *a = Limb::select(*a, *b, is_zero);
+    let mut i = 0;
+    while i < n {
+        lower[i] = Limb::select(lower[i], upper[i], is_zero);
+        i += 1;
     }
 }
 
 #[inline]
-fn add_mul_vvw(z: &mut [Limb], x: &[Limb], y: Limb) -> Limb {
+const fn add_mul_vvw(z: &mut [Limb], x: &[Limb], y: Limb) -> Limb {
+    let n = x.len();
+    if n > z.len() {
+        panic!("Failed preconditions in montgomery_mul");
+    }
+
     let mut c = Limb::ZERO;
-    for (zi, xi) in z.iter_mut().zip(x.iter()) {
-        let (z0, z1) = zi.mac(*xi, y, Limb::ZERO);
+
+    let mut i = 0;
+    while i < n {
+        let (z0, z1) = z[i].mac(x[i], y, Limb::ZERO);
         let (zi_, c_) = z0.overflowing_add(c);
-        *zi = zi_;
+        z[i] = zi_;
         c = c_.wrapping_add(z1);
+        i += 1;
     }
 
     c
 }
 
 #[inline(always)]
-fn sub_vv(z: &mut [Limb], x: &[Limb], y: &[Limb]) {
+const fn sub_vv(z: &mut [Limb], x: &[Limb], y: &[Limb]) {
+    let n = z.len();
+    if !(n == x.len() && n == y.len()) {
+        panic!("Failed preconditions in montgomery_mul");
+    }
+
     let mut borrow = Limb::ZERO;
-    for (i, (&xi, &yi)) in x.iter().zip(y.iter()).enumerate().take(z.len()) {
-        let (zi, new_borrow) = xi.sbb(yi, borrow);
+
+    let mut i = 0;
+    while i < n {
+        let (zi, new_borrow) = x[i].sbb(y[i], borrow);
         z[i] = zi;
         borrow = new_borrow;
+        i += 1;
     }
 }
 

From a3d464c8d15e3e5f1e6cb7454cf4b8404224722e Mon Sep 17 00:00:00 2001
From: Bogdan Opanchuk <bogdan@opanchuk.net>
Date: Wed, 5 Mar 2025 12:43:56 -0800
Subject: [PATCH 11/12] Use canonical CIOS with N-sized buffer, expand
 comments.

---
 src/modular/boxed_monty_form/mul.rs | 211 +++++++++++++++-------------
 src/modular/boxed_monty_form/pow.rs |  19 ++-
 src/primitives.rs                   |  12 +-
 3 files changed, 138 insertions(+), 104 deletions(-)

diff --git a/src/modular/boxed_monty_form/mul.rs b/src/modular/boxed_monty_form/mul.rs
index 2680d83b..e6bd183b 100644
--- a/src/modular/boxed_monty_form/mul.rs
+++ b/src/modular/boxed_monty_form/mul.rs
@@ -6,7 +6,7 @@
 //! Originally (c) 2014 The Rust Project Developers, dual licensed Apache 2.0+MIT.
 
 use super::{BoxedMontyForm, BoxedMontyParams};
-use crate::{BoxedUint, ConstChoice, Limb, Square, SquareAssign, Word};
+use crate::{BoxedUint, ConstChoice, Limb, Square, SquareAssign};
 use core::{
     borrow::Borrow,
     ops::{Mul, MulAssign},
@@ -113,7 +113,7 @@ impl<'a> MontyMultiplier<'a> {
     /// Create a new Montgomery multiplier.
     pub(super) fn new(modulus: &'a BoxedUint, mod_neg_inv: Limb) -> Self {
         Self {
-            product: BoxedUint::zero_with_precision(modulus.bits_precision() * 2),
+            product: BoxedUint::zero_with_precision(modulus.bits_precision()),
             modulus,
             mod_neg_inv,
         }
@@ -130,7 +130,6 @@ impl<'a> MontyMultiplier<'a> {
     pub(super) fn mul_assign(&mut self, a: &mut BoxedUint, b: &BoxedUint) {
         self.mul_amm_assign(a, b);
         a.sub_assign_mod_with_carry(Limb::ZERO, self.modulus, self.modulus);
-
         debug_assert!(&*a < self.modulus);
     }
 
@@ -147,9 +146,9 @@ impl<'a> MontyMultiplier<'a> {
             self.modulus.as_limbs(),
             self.mod_neg_inv,
         );
-        ret.limbs
-            .copy_from_slice(&self.product.limbs[..a.limbs.len()]);
-        ret.sub_assign_mod_with_carry(Limb::ZERO, self.modulus, self.modulus);
+        ret.limbs.copy_from_slice(&self.product.limbs);
+
+        // Note: no reduction is required, see the doc comment of `almost_montgomery_mul()`.
 
         ret
     }
@@ -165,7 +164,6 @@ impl<'a> MontyMultiplier<'a> {
     pub(super) fn square_assign(&mut self, a: &mut BoxedUint) {
         self.square_amm_assign(a);
         a.sub_assign_mod_with_carry(Limb::ZERO, self.modulus, self.modulus);
-
         debug_assert!(&*a < self.modulus);
     }
 
@@ -197,8 +195,7 @@ impl<'a> MontyMultiplier<'a> {
             self.modulus.as_limbs(),
             self.mod_neg_inv,
         );
-        a.limbs
-            .copy_from_slice(&self.product.limbs[..a.limbs.len()]);
+        a.limbs.copy_from_slice(&self.product.limbs);
     }
 
     /// Perform a squaring using "Almost Montgomery Multiplication".
@@ -230,8 +227,7 @@ impl<'a> MontyMultiplier<'a> {
             self.modulus.as_limbs(),
             self.mod_neg_inv,
         );
-        a.limbs
-            .copy_from_slice(&self.product.limbs[..a.limbs.len()]);
+        a.limbs.copy_from_slice(&self.product.limbs);
     }
 
     /// Clear the internal product buffer.
@@ -250,140 +246,163 @@ impl Drop for MontyMultiplier<'_> {
     }
 }
 
-/// Compute an "Almost Montgomery Multiplication (AMM)" as described in the paper
-/// "Efficient Software Implementations of Modular Exponentiation"
-/// <https://eprint.iacr.org/2011/239.pdf>
-///
-/// Computes z mod m = x * y * 2 ** (-n*_W) mod m assuming k = -1/m mod 2**_W.
-///
-/// x and y are required to satisfy 0 <= z < 2**(n*_W) and then the result z is guaranteed to
-/// satisfy 0 <= z < 2**(n*_W), but it may not be < m.
-///
-/// Output is written into the lower (i.e. first) half of `z`.
-///
-/// Note: this was adapted from an implementation in `num-bigint`'s `monty.rs`.
+/**
+Computes Montgomery multiplication of `x` and `y` into `z`, that is
+`z mod m = x * y * 2^(-n*W) mod m` assuming `k = -1/m mod 2^W`,
+where `W` is the bit size of the limb, and `n * W` is the full bit size of the integer.
+
+NOTE: `z` is assumed to be pre-zeroized.
+
+This function implements the Coarsely Integrated Operand Scanning (CIOS) variation
+of Montgomery multiplication, using the classification from
+"Analyzing and Comparing Montgomery Multiplication Algorithms" by Koc et al
+(<https://www.microsoft.com/en-us/research/wp-content/uploads/1996/01/j37acmon.pdf>).
+
+Additionally, unlike in Koc et al, we are reducing the final result only if it overflows
+`2^(n*W)`, not when it overflows `m`.
+This means that this function does not assume `x` and `y` are reduced `mod m`,
+and the result will be correct `mod m`, but potentially greater than `m`,
+and smaller than `2^(n * W) + m`.
+See "Efficient Software Implementations of Modular Exponentiation" by S. Gueron for details
+(<https://eprint.iacr.org/2011/239.pdf>).
+
+This function exhibits certain properties which were discovered via randomized tests,
+but (to my knowledge at this moment) have not been proven formally.
+Hereinafter I denote `f(x) = floor(x / m)`, that is `f` is the number of subtractions
+of the modulus required to fully reduce `x`.
+
+1. In general, if `f(x) = k` and `f(y) = n`, then `f(AMM(x, y)) <= min(k, n) + 1`.
+   That is the "reduction error" grows with every operation,
+   but is determined by the argument with the lower error.
+2. To retrieve the number from Montgomery form we MM it by 1. In this case `f(AMM(x, 1)) = 0`,
+   that is the result is always fully reduced regardless of `f(x)`.
+3. `f(AMM(x, x)) <= 1` regardless of `f(x)`. That is, squaring resets the error to at most 1.
+*/
 // TODO(tarcieri): refactor into `reduction.rs`, share impl with `MontyForm`?
-const fn almost_montgomery_mul(z: &mut [Limb], x: &[Limb], y: &[Limb], m: &[Limb], k: Limb) {
-    // This code assumes x, y, m are all the same length (required by addMulVVW and the for loop).
-    // It also assumes that x, y are already reduced mod m, or else the result will not be properly
-    // reduced.
-    let n = m.len();
+pub(crate) const fn almost_montgomery_mul(
+    z: &mut [Limb],
+    x: &[Limb],
+    y: &[Limb],
+    m: &[Limb],
+    k: Limb,
+) {
+    let n = z.len();
 
     // This preconditions check allows compiler to remove bound checks later in the code.
-    let pre_cond = z.len() > n && x.len() == n && y.len() == n && m.len() == n;
-    if !pre_cond {
-        panic!("Failed preconditions in montgomery_mul");
+    if !(x.len() == n && y.len() == n && m.len() == n) {
+        panic!("Failed preconditions in `almost_montgomery_mul`");
     }
 
-    let mut c = ConstChoice::FALSE;
+    let mut ts = Limb::ZERO;
 
     let mut i = 0;
     while i < n {
-        let (_, z_slice) = z.split_at_mut(i);
-        let c2 = add_mul_vvw(z_slice, x, y[i]);
-        let t = z_slice[0].wrapping_mul(k);
-        let c3 = add_mul_vvw(z_slice, m, t);
-        let cx = c2.wrapping_add(Limb(c.to_u8() as Word));
-        let cy = cx.wrapping_add(c3);
-        z[n + i] = cy;
-        c = ConstChoice::from_word_lt(cx.0, c2.0).or(ConstChoice::from_word_lt(cy.0, c3.0));
-        i += 1;
-    }
+        let mut c = add_mul_carry(z, x, y[i]);
+        (ts, c) = ts.overflowing_add(c);
+        let ts1 = c;
 
-    let (lower, upper) = z.split_at_mut(n);
-    sub_vv(lower, upper, m);
+        let t = z[0].wrapping_mul(k);
+
+        c = add_mul_carry_and_shift(z, m, t);
+        (z[n - 1], c) = ts.overflowing_add(c);
+        ts = ts1.wrapping_add(c);
 
-    let is_zero = c.not();
-    let mut i = 0;
-    while i < n {
-        lower[i] = Limb::select(lower[i], upper[i], is_zero);
         i += 1;
     }
+
+    // If the result overflows the integer size, subtract the modulus.
+    let overflow = ConstChoice::from_word_lsb(ts.0);
+    conditional_sub(z, m, overflow);
 }
 
-/// Same as `almost_montgomery_mul` with `y == 1`.
+/// Same as `almost_montgomery_mul()` with `y == 1`.
 ///
 /// Used for retrieving from Montgomery form.
-const fn almost_montgomery_mul_by_one(z: &mut [Limb], x: &[Limb], m: &[Limb], k: Limb) {
-    // This code assumes x, m are all the same length (required by addMulVVW and the for loop).
-    // It also assumes that x is already reduced mod m, or else the result will not be properly
-    // reduced.
-    let n = m.len();
+pub(crate) const fn almost_montgomery_mul_by_one(z: &mut [Limb], x: &[Limb], m: &[Limb], k: Limb) {
+    let n = z.len();
 
     // This preconditions check allows compiler to remove bound checks later in the code.
-    let pre_cond = z.len() > n && x.len() == n && m.len() == n;
-    if !pre_cond {
-        panic!("Failed preconditions in montgomery_mul");
+    if !(x.len() == n && m.len() == n) {
+        panic!("Failed preconditions in `almost_montgomery_mul_by_one`");
     }
 
-    let mut c = ConstChoice::FALSE;
+    let mut ts = Limb::ZERO;
 
-    // The unrolled first iteration.
-    let c2 = add_mul_vvw(z, x, Limb::ONE);
-    let t = z[0].wrapping_mul(k);
-    let c3 = add_mul_vvw(z, m, t);
-    let cx = c2.wrapping_add(Limb(c.to_u8() as Word));
-    let cy = cx.wrapping_add(c3);
-    z[n] = cy;
-    c = ConstChoice::from_word_lt(cx.0, c2.0).or(ConstChoice::from_word_lt(cy.0, c3.0));
-
-    let mut i = 1;
+    let mut i = 0;
     while i < n {
-        let (_, z_slice) = z.split_at_mut(i);
-        let c2 = add_mul_vvw(z_slice, x, Limb::ZERO);
-        let t = z_slice[0].wrapping_mul(k);
-        let c3 = add_mul_vvw(z_slice, m, t);
-        let cx = c2.wrapping_add(Limb(c.to_u8() as Word));
-        let cy = cx.wrapping_add(c3);
-        z[n + i] = cy;
-        c = ConstChoice::from_word_lt(cx.0, c2.0).or(ConstChoice::from_word_lt(cy.0, c3.0));
+        let mut c = if i == 0 {
+            add_mul_carry(z, x, Limb::ONE)
+        } else {
+            Limb::ZERO
+        };
+        (ts, c) = ts.overflowing_add(c);
+        let ts1 = c;
+
+        let t = z[0].wrapping_mul(k);
+
+        c = add_mul_carry_and_shift(z, m, t);
+        (z[n - 1], c) = ts.overflowing_add(c);
+        ts = ts1.wrapping_add(c);
+
         i += 1;
     }
 
-    let (lower, upper) = z.split_at_mut(n);
-    sub_vv(lower, upper, m);
+    // If the result overflows the integer size, subtract the modulus.
+    let overflow = ConstChoice::from_word_lsb(ts.0);
+    conditional_sub(z, m, overflow);
+}
+
+/// Calculates `z += x * y` and returns the carry.
+#[inline]
+const fn add_mul_carry(z: &mut [Limb], x: &[Limb], y: Limb) -> Limb {
+    let n = z.len();
+    if n != x.len() {
+        panic!("Failed preconditions in `add_mul_carry`");
+    }
 
-    let is_zero = c.not();
+    let mut c = Limb::ZERO;
     let mut i = 0;
     while i < n {
-        lower[i] = Limb::select(lower[i], upper[i], is_zero);
+        (z[i], c) = z[i].mac(x[i], y, c);
         i += 1;
     }
+    c
 }
 
+/// Calculates `z = (z + x * y) / 2^W` and returns the carry (of the `z + x * y`).
 #[inline]
-const fn add_mul_vvw(z: &mut [Limb], x: &[Limb], y: Limb) -> Limb {
-    let n = x.len();
-    if n > z.len() {
-        panic!("Failed preconditions in montgomery_mul");
+const fn add_mul_carry_and_shift(z: &mut [Limb], x: &[Limb], y: Limb) -> Limb {
+    let n = z.len();
+    if n != x.len() {
+        panic!("Failed preconditions in `add_mul_carry_and_shift`");
     }
 
-    let mut c = Limb::ZERO;
+    let (_, mut c) = z[0].mac(x[0], y, Limb::ZERO);
 
-    let mut i = 0;
-    while i < n {
-        let (z0, z1) = z[i].mac(x[i], y, Limb::ZERO);
-        let (zi_, c_) = z0.overflowing_add(c);
-        z[i] = zi_;
-        c = c_.wrapping_add(z1);
+    let mut i = 1;
+    let mut i1 = 0;
+    // Help the compiler elide bound checking
+    while i < n && i1 < n {
+        (z[i1], c) = z[i].mac(x[i], y, c);
         i += 1;
+        i1 += 1;
     }
 
     c
 }
 
+/// Calculates `z -= x` if `c` is truthy, otherwise `z` is unchanged.
 #[inline(always)]
-const fn sub_vv(z: &mut [Limb], x: &[Limb], y: &[Limb]) {
+const fn conditional_sub(z: &mut [Limb], x: &[Limb], c: ConstChoice) {
     let n = z.len();
-    if !(n == x.len() && n == y.len()) {
-        panic!("Failed preconditions in montgomery_mul");
+    if n != x.len() {
+        panic!("Failed preconditions in `conditional_sub`");
     }
 
     let mut borrow = Limb::ZERO;
-
     let mut i = 0;
     while i < n {
-        let (zi, new_borrow) = x[i].sbb(y[i], borrow);
+        let (zi, new_borrow) = z[i].sbb(Limb(c.if_true_word(x[i].0)), borrow);
         z[i] = zi;
         borrow = new_borrow;
         i += 1;
diff --git a/src/modular/boxed_monty_form/pow.rs b/src/modular/boxed_monty_form/pow.rs
index 72f95e68..9a2c70a3 100644
--- a/src/modular/boxed_monty_form/pow.rs
+++ b/src/modular/boxed_monty_form/pow.rs
@@ -111,12 +111,21 @@ fn pow_montgomery_form(
         }
     }
 
-    // Ensure output is properly reduced: AMM only reduces to the bit length of `modulus`
-    // See RustCrypto/crypto-bigint#441
-    z.conditional_sbb_assign(modulus, !z.ct_lt(modulus));
+    // Ensure the output is properly reduced.
+    //
+    // Using the properties of `almost_montgomery_mul()` (see its documentation):
+    // - We have an incoming `x` which is fully reduced (`floor(x / modulus) = 0`).
+    // - We build an array of `powers` which are produced by multiplying the previous power by `x`,
+    //   so for each power `floor(power / modulus) <= 1`.
+    // - Then we take turns squaring the accumulator `z` (bringing `floor(z / modulus)` to 1
+    //   regardless of the previous reduction level) and multiplying by a power of `x`
+    //   (bringing `floor(z / modulus)` to at most 2).
+    // - Then we either exit the loop, or square again, which brings `floor(z / modulus)` back to 1.
+    //
+    // Now that we exited the loop, we need to reduce `z` at most twice
+    // to bring it within `[0, modulus)`.
 
-    // Subtract again to ensure output is fully reduced
-    // See RustCrypto/crypto-bigint#455 and golang.org/issue/13907
+    z.conditional_sbb_assign(modulus, !z.ct_lt(modulus));
     z.conditional_sbb_assign(modulus, !z.ct_lt(modulus));
     debug_assert!(&z < modulus);
 
diff --git a/src/primitives.rs b/src/primitives.rs
index 3a0ae58e..731cf55f 100644
--- a/src/primitives.rs
+++ b/src/primitives.rs
@@ -61,7 +61,13 @@ pub(crate) const fn mac(a: Word, b: Word, c: Word, carry: Word) -> (Word, Word)
     let a = a as WideWord;
     let b = b as WideWord;
     let c = c as WideWord;
-    let carry = carry as WideWord;
-    let ret = a + (b * c) + carry;
-    (ret as Word, (ret >> Word::BITS) as Word)
+    let ret = a + (b * c);
+    let (lo, hi) = (ret as Word, (ret >> Word::BITS) as Word);
+
+    let (lo, c) = lo.overflowing_add(carry);
+
+    // Even if all the arguments are `Word::MAX` we can't overflow `hi`.
+    let hi = hi.wrapping_add(c as Word);
+
+    (lo, hi)
 }

From 8dc0d4a88781ec0aed85c41418c27b64cef0d5cd Mon Sep 17 00:00:00 2001
From: Bogdan Opanchuk <bogdan@opanchuk.net>
Date: Wed, 5 Mar 2025 12:39:54 -0800
Subject: [PATCH 12/12] Expose MontyMultiplier

---
 src/modular/boxed_monty_form.rs     | 22 +++++++++++----
 src/modular/boxed_monty_form/mul.rs | 43 +++++++++++++++++++----------
 src/modular/boxed_monty_form/pow.rs |  4 +--
 src/modular/monty_form.rs           |  7 +++++
 src/modular/monty_form/mul.rs       | 38 +++++++++++++++++++++++--
 src/traits.rs                       | 25 +++++++++++++++++
 6 files changed, 116 insertions(+), 23 deletions(-)

diff --git a/src/modular/boxed_monty_form.rs b/src/modular/boxed_monty_form.rs
index 5a414c6c..1ee2abc0 100644
--- a/src/modular/boxed_monty_form.rs
+++ b/src/modular/boxed_monty_form.rs
@@ -9,7 +9,7 @@ mod pow;
 mod sub;
 
 use super::{ConstMontyParams, Retrieve, div_by_2};
-use mul::MontyMultiplier;
+use mul::BoxedMontyMultiplier;
 
 use crate::{BoxedUint, Limb, Monty, Odd, Word};
 use alloc::sync::Arc;
@@ -67,7 +67,7 @@ impl BoxedMontyParams {
         let mod_leading_zeros = modulus.as_ref().leading_zeros().min(Word::BITS - 1);
 
         let r3 = {
-            let mut mm = MontyMultiplier::new(&modulus, mod_neg_inv);
+            let mut mm = BoxedMontyMultiplier::new(&modulus, mod_neg_inv);
             mm.square(&r2)
         };
 
@@ -110,7 +110,7 @@ impl BoxedMontyParams {
         let mod_leading_zeros = modulus.as_ref().leading_zeros().min(Word::BITS - 1);
 
         let r3 = {
-            let mut mm = MontyMultiplier::new(&modulus, mod_neg_inv);
+            let mut mm = BoxedMontyMultiplier::new(&modulus, mod_neg_inv);
             mm.square(&r2)
         };
 
@@ -187,7 +187,7 @@ impl BoxedMontyForm {
 
     /// Retrieves the integer currently encoded in this [`BoxedMontyForm`], guaranteed to be reduced.
     pub fn retrieve(&self) -> BoxedUint {
-        let mut mm = MontyMultiplier::from(self.params.as_ref());
+        let mut mm = BoxedMontyMultiplier::from(self.params.as_ref());
         mm.mul_by_one(&self.montgomery_form)
     }
 
@@ -277,6 +277,7 @@ impl Retrieve for BoxedMontyForm {
 impl Monty for BoxedMontyForm {
     type Integer = BoxedUint;
     type Params = BoxedMontyParams;
+    type Multiplier<'a> = BoxedMontyMultiplier<'a>;
 
     fn new_params_vartime(modulus: Odd<Self::Integer>) -> Self::Params {
         BoxedMontyParams::new_vartime(modulus)
@@ -302,6 +303,17 @@ impl Monty for BoxedMontyForm {
         &self.montgomery_form
     }
 
+    fn copy_montgomery_from(&mut self, other: &Self) {
+        debug_assert_eq!(
+            self.montgomery_form.bits_precision(),
+            other.montgomery_form.bits_precision()
+        );
+        debug_assert_eq!(self.params, other.params);
+        self.montgomery_form
+            .limbs
+            .copy_from_slice(&other.montgomery_form.limbs);
+    }
+
     fn double(&self) -> Self {
         BoxedMontyForm::double(self)
     }
@@ -330,7 +342,7 @@ impl Zeroize for BoxedMontyForm {
 /// Convert the given integer into the Montgomery domain.
 #[inline]
 fn convert_to_montgomery(integer: &mut BoxedUint, params: &BoxedMontyParams) {
-    let mut mm = MontyMultiplier::from(params);
+    let mut mm = BoxedMontyMultiplier::from(params);
     mm.mul_assign(integer, &params.r2);
 }
 
diff --git a/src/modular/boxed_monty_form/mul.rs b/src/modular/boxed_monty_form/mul.rs
index e6bd183b..d87d1ffd 100644
--- a/src/modular/boxed_monty_form/mul.rs
+++ b/src/modular/boxed_monty_form/mul.rs
@@ -6,7 +6,7 @@
 //! Originally (c) 2014 The Rust Project Developers, dual licensed Apache 2.0+MIT.
 
 use super::{BoxedMontyForm, BoxedMontyParams};
-use crate::{BoxedUint, ConstChoice, Limb, Square, SquareAssign};
+use crate::{BoxedUint, ConstChoice, Limb, MontyMultiplier, Square, SquareAssign};
 use core::{
     borrow::Borrow,
     ops::{Mul, MulAssign},
@@ -19,7 +19,7 @@ impl BoxedMontyForm {
     /// Multiplies by `rhs`.
     pub fn mul(&self, rhs: &Self) -> Self {
         debug_assert_eq!(&self.params, &rhs.params);
-        let montgomery_form = MontyMultiplier::from(self.params.borrow())
+        let montgomery_form = BoxedMontyMultiplier::from(self.params.borrow())
             .mul(&self.montgomery_form, &rhs.montgomery_form);
 
         Self {
@@ -31,7 +31,7 @@ impl BoxedMontyForm {
     /// Computes the (reduced) square.
     pub fn square(&self) -> Self {
         let montgomery_form =
-            MontyMultiplier::from(self.params.borrow()).square(&self.montgomery_form);
+            BoxedMontyMultiplier::from(self.params.borrow()).square(&self.montgomery_form);
 
         Self {
             montgomery_form,
@@ -79,7 +79,7 @@ impl MulAssign<BoxedMontyForm> for BoxedMontyForm {
 impl MulAssign<&BoxedMontyForm> for BoxedMontyForm {
     fn mul_assign(&mut self, rhs: &BoxedMontyForm) {
         debug_assert_eq!(&self.params, &rhs.params);
-        MontyMultiplier::from(self.params.borrow())
+        BoxedMontyMultiplier::from(self.params.borrow())
             .mul_assign(&mut self.montgomery_form, &rhs.montgomery_form);
     }
 }
@@ -92,24 +92,39 @@ impl Square for BoxedMontyForm {
 
 impl SquareAssign for BoxedMontyForm {
     fn square_assign(&mut self) {
-        MontyMultiplier::from(self.params.borrow()).square_assign(&mut self.montgomery_form);
-    }
-}
-
-impl<'a> From<&'a BoxedMontyParams> for MontyMultiplier<'a> {
-    fn from(params: &'a BoxedMontyParams) -> MontyMultiplier<'a> {
-        MontyMultiplier::new(&params.modulus, params.mod_neg_inv)
+        BoxedMontyMultiplier::from(self.params.borrow()).square_assign(&mut self.montgomery_form);
     }
 }
 
 /// Montgomery multiplier with a pre-allocated internal buffer to avoid additional allocations.
-pub(super) struct MontyMultiplier<'a> {
+#[derive(Debug, Clone)]
+pub struct BoxedMontyMultiplier<'a> {
     product: BoxedUint,
     modulus: &'a BoxedUint,
     mod_neg_inv: Limb,
 }
 
-impl<'a> MontyMultiplier<'a> {
+impl<'a> From<&'a BoxedMontyParams> for BoxedMontyMultiplier<'a> {
+    fn from(params: &'a BoxedMontyParams) -> BoxedMontyMultiplier<'a> {
+        BoxedMontyMultiplier::new(&params.modulus, params.mod_neg_inv)
+    }
+}
+
+impl<'a> MontyMultiplier<'a> for BoxedMontyMultiplier<'a> {
+    type Monty = BoxedMontyForm;
+
+    /// Performs a Montgomery multiplication, assigning a fully reduced result to `lhs`.
+    fn mul_assign(&mut self, lhs: &mut Self::Monty, rhs: &Self::Monty) {
+        self.mul_assign(&mut lhs.montgomery_form, &rhs.montgomery_form);
+    }
+
+    /// Performs a Montgomery squaring, assigning a fully reduced result to `lhs`.
+    fn square_assign(&mut self, lhs: &mut Self::Monty) {
+        self.square_assign(&mut lhs.montgomery_form);
+    }
+}
+
+impl<'a> BoxedMontyMultiplier<'a> {
     /// Create a new Montgomery multiplier.
     pub(super) fn new(modulus: &'a BoxedUint, mod_neg_inv: Limb) -> Self {
         Self {
@@ -240,7 +255,7 @@ impl<'a> MontyMultiplier<'a> {
 }
 
 #[cfg(feature = "zeroize")]
-impl Drop for MontyMultiplier<'_> {
+impl Drop for BoxedMontyMultiplier<'_> {
     fn drop(&mut self) {
         self.product.zeroize();
     }
diff --git a/src/modular/boxed_monty_form/pow.rs b/src/modular/boxed_monty_form/pow.rs
index 9a2c70a3..93fd5d67 100644
--- a/src/modular/boxed_monty_form/pow.rs
+++ b/src/modular/boxed_monty_form/pow.rs
@@ -1,6 +1,6 @@
 //! Modular exponentiation support for [`BoxedMontyForm`].
 
-use super::{BoxedMontyForm, mul::MontyMultiplier};
+use super::{BoxedMontyForm, mul::BoxedMontyMultiplier};
 use crate::{BoxedUint, ConstantTimeSelect, Limb, PowBoundedExp, Word};
 use alloc::vec::Vec;
 use subtle::{ConstantTimeEq, ConstantTimeLess};
@@ -60,7 +60,7 @@ fn pow_montgomery_form(
     const WINDOW: u32 = 4;
     const WINDOW_MASK: Word = (1 << WINDOW) - 1;
 
-    let mut multiplier = MontyMultiplier::new(modulus, mod_neg_inv);
+    let mut multiplier = BoxedMontyMultiplier::new(modulus, mod_neg_inv);
 
     // powers[i] contains x^i
     let mut powers = Vec::with_capacity(1 << WINDOW);
diff --git a/src/modular/monty_form.rs b/src/modular/monty_form.rs
index 84910592..7e3f8905 100644
--- a/src/modular/monty_form.rs
+++ b/src/modular/monty_form.rs
@@ -15,6 +15,7 @@ use super::{
     reduction::montgomery_reduction,
 };
 use crate::{Concat, ConstChoice, Limb, Monty, NonZero, Odd, Split, Uint, Word};
+use mul::DynMontyMultiplier;
 use subtle::{Choice, ConditionallySelectable, ConstantTimeEq};
 
 /// Parameters to efficiently go to/from the Montgomery form for an odd modulus provided at runtime.
@@ -271,6 +272,7 @@ impl<const LIMBS: usize> Retrieve for MontyForm<LIMBS> {
 impl<const LIMBS: usize> Monty for MontyForm<LIMBS> {
     type Integer = Uint<LIMBS>;
     type Params = MontyParams<LIMBS>;
+    type Multiplier<'a> = DynMontyMultiplier<'a, LIMBS>;
 
     fn new_params_vartime(modulus: Odd<Self::Integer>) -> Self::Params {
         MontyParams::new_vartime(modulus)
@@ -296,6 +298,11 @@ impl<const LIMBS: usize> Monty for MontyForm<LIMBS> {
         &self.montgomery_form
     }
 
+    fn copy_montgomery_from(&mut self, other: &Self) {
+        debug_assert_eq!(self.params, other.params);
+        self.montgomery_form = other.montgomery_form;
+    }
+
     fn double(&self) -> Self {
         MontyForm::double(self)
     }
diff --git a/src/modular/monty_form/mul.rs b/src/modular/monty_form/mul.rs
index 11344ed2..710c719e 100644
--- a/src/modular/monty_form/mul.rs
+++ b/src/modular/monty_form/mul.rs
@@ -2,8 +2,11 @@
 
 use super::MontyForm;
 use crate::{
-    Square, SquareAssign,
-    modular::mul::{mul_montgomery_form, square_montgomery_form},
+    MontyMultiplier, Square, SquareAssign,
+    modular::{
+        MontyParams,
+        mul::{mul_montgomery_form, square_montgomery_form},
+    },
 };
 use core::ops::{Mul, MulAssign};
 
@@ -88,3 +91,34 @@ impl<const LIMBS: usize> SquareAssign for MontyForm<LIMBS> {
         *self = self.square()
     }
 }
+
+#[derive(Debug, Clone, Copy)]
+pub struct DynMontyMultiplier<'a, const LIMBS: usize>(&'a MontyParams<LIMBS>);
+
+impl<'a, const LIMBS: usize> From<&'a MontyParams<LIMBS>> for DynMontyMultiplier<'a, LIMBS> {
+    fn from(source: &'a MontyParams<LIMBS>) -> Self {
+        Self(source)
+    }
+}
+
+impl<'a, const LIMBS: usize> MontyMultiplier<'a> for DynMontyMultiplier<'a, LIMBS> {
+    type Monty = MontyForm<LIMBS>;
+
+    /// Performs a Montgomery multiplication, assigning a fully reduced result to `lhs`.
+    fn mul_assign(&mut self, lhs: &mut Self::Monty, rhs: &Self::Monty) {
+        let product = mul_montgomery_form(
+            &lhs.montgomery_form,
+            &rhs.montgomery_form,
+            &self.0.modulus,
+            self.0.mod_neg_inv,
+        );
+        lhs.montgomery_form = product;
+    }
+
+    /// Performs a Montgomery squaring, assigning a fully reduced result to `lhs`.
+    fn square_assign(&mut self, lhs: &mut Self::Monty) {
+        let product =
+            square_montgomery_form(&lhs.montgomery_form, &self.0.modulus, self.0.mod_neg_inv);
+        lhs.montgomery_form = product;
+    }
+}
diff --git a/src/traits.rs b/src/traits.rs
index 2c60c544..949069fa 100644
--- a/src/traits.rs
+++ b/src/traits.rs
@@ -870,6 +870,9 @@ pub trait Monty:
     /// The original integer type.
     type Integer: Integer<Monty = Self>;
 
+    /// Prepared Montgomery multiplier for tight loops.
+    type Multiplier<'a>: Debug + Clone + MontyMultiplier<'a, Monty = Self>;
+
     /// The precomputed data needed for this representation.
     type Params: 'static + Clone + Debug + Eq + Sized + Send + Sync;
 
@@ -892,6 +895,10 @@ pub trait Monty:
     /// Access the value in Montgomery form.
     fn as_montgomery(&self) -> &Self::Integer;
 
+    /// Copy the Montgomery representation from `other` into `self`.
+    /// NOTE: the parameters remain unchanged.
+    fn copy_montgomery_from(&mut self, other: &Self);
+
     /// Performs doubling, returning `self + self`.
     fn double(&self) -> Self;
 
@@ -913,3 +920,21 @@ pub trait Monty:
     /// Montgomery parameters.
     fn lincomb_vartime(products: &[(&Self, &Self)]) -> Self;
 }
+
+/// Prepared Montgomery multiplier for tight loops.
+///
+/// Allows one to perform in-place multiplication without allocations
+/// (important for the `BoxedUint` case).
+///
+/// NOTE: You will be operating with Montgomery representations directly,
+/// so make sure they all correspond to the same set of parameters.
+pub trait MontyMultiplier<'a>: From<&'a <Self::Monty as Monty>::Params> {
+    /// The associated Montgomery-representation integer.
+    type Monty: Monty;
+
+    /// Performs a Montgomery multiplication, assigning a fully reduced result to `lhs`.
+    fn mul_assign(&mut self, lhs: &mut Self::Monty, rhs: &Self::Monty);
+
+    /// Performs a Montgomery squaring, assigning a fully reduced result to `lhs`.
+    fn square_assign(&mut self, lhs: &mut Self::Monty);
+}