10#ifndef RANLUXPP_MULMOD_H
11#define RANLUXPP_MULMOD_H
22static void multiply9x9(
const uint64_t *in1,
const uint64_t *in2,
25 unsigned nextCarry = 0;
27#if defined(__clang__) || defined(__INTEL_COMPILER)
29#elif defined(__GNUC__) && __GNUC__ >= 8
33 for (
int i = 0; i < 18; i++) {
34 uint64_t current = next;
35 unsigned carry = nextCarry;
40#if defined(__clang__) || defined(__INTEL_COMPILER)
42#elif defined(__GNUC__) && __GNUC__ >= 8
46 for (
int j = 0; j < 9; j++) {
51 uint64_t fac1 = in1[j];
52 uint64_t fac2 = in2[k];
53#if defined(__SIZEOF_INT128__) && !defined(CLHEP_NO_INT128)
55#pragma GCC diagnostic push
56#pragma GCC diagnostic ignored "-Wpedantic"
58 unsigned __int128 prod = fac1;
61 uint64_t upper = prod >> 64;
62 uint64_t lower =
static_cast<uint64_t
>(prod);
64#pragma GCC diagnostic pop
67 uint64_t upper1 = fac1 >> 32;
68 uint64_t lower1 =
static_cast<uint32_t
>(fac1);
70 uint64_t upper2 = fac2 >> 32;
71 uint64_t lower2 =
static_cast<uint32_t
>(fac2);
75 uint64_t upper = upper1 * upper2;
76 uint64_t middle1 = upper1 * lower2;
77 uint64_t middle2 = lower1 * upper2;
78 uint64_t lower = lower1 * lower2;
83 uint64_t middle = add_overflow(middle1, middle2, overflow);
94 uint64_t overflow_add = overflow * (uint64_t(1) << 32);
99 upper += overflow_add;
101 uint64_t middle_upper = middle >> 32;
102 uint64_t middle_lower = middle << 32;
104 lower = add_overflow(lower, middle_lower, overflow);
119 upper += middle_upper;
123 current = add_carry(current, lower, carry);
126 next = add_carry(next, upper, nextCarry);
129 next = add_carry(next, carry, nextCarry);
143static void mod_m(
const uint64_t *mul, uint64_t *out) {
146 for (
int i = 0; i < 9; i++) {
150 int64_t c = compute_r(mul + 9, r);
174 uint64_t c_unsigned =
static_cast<uint64_t
>(c);
177 int64_t t2 = t0 - (c_unsigned << 48);
181 int64_t t1 = t2 >> 48;
187 uint64_t out_0 = sub_carry(r_0, c, carry);
190 for (
int i = 1; i < 3; i++) {
192 r_i = sub_overflow(r_i, carry, carry);
194 uint64_t out_i = sub_carry(r_i, t0, carry);
199 r_3 = sub_overflow(r_3, carry, carry);
201 uint64_t out_3 = sub_carry(r_3, t2, carry);
204 for (
int i = 4; i < 9; i++) {
206 r_i = sub_overflow(r_i, carry, carry);
208 uint64_t out_i = sub_carry(r_i, t1, carry);
219static void mulmod(
const uint64_t *in1, uint64_t *inout) {
220 uint64_t mul[2 * 9] = {0};
221 multiply9x9(in1, inout, mul);
232static void powermod(
const uint64_t *base, uint64_t *res, uint64_t n) {
233 uint64_t fac[9] = {0};
236 for (
int i = 1; i < 9; i++) {
241 uint64_t mul[18] = {0};
244 multiply9x9(res, fac, mul);
250 multiply9x9(fac, fac, mul);