1 Stars and Bars (Unconstrained / Simple Bounds)
Count non-negative integer solutions of
$$x_1 + x_2 + \cdots + x_k = n, \quad x_i \ge 0.$$Model. Represent the $n$ units as $n$ identical balls placed into $k$ labeled bins. Insert $k-1$ dividers to separate bins. Together the $n$ balls and $k-1$ dividers occupy $n+k-1$ slots; choosing which $k-1$ slots hold dividers determines the solution:
$$\boxed{\binom{n+k-1}{k-1}}$$Strictly positive ($x_i \ge 1$): substitute $y_i = x_i - 1 \ge 0$, so $\sum y_i = n - k$. Requires $n \ge k$; formula:
$$\binom{n-1}{k-1}$$Lower bounds ($x_i \ge a_i$): substitute $y_i = x_i - a_i \ge 0$, so $\sum y_i = n - \sum a_i$. Requires $n \ge \sum a_i$; formula:
$$\binom{n - \sum_{i=1}^k a_i + k - 1}{k-1}$$At most $n$ ($x_1 + \cdots + x_k \le n$, each $x_i \ge 0$):
- Add a slack variable $Z \ge 0$, so $x_1 + \cdots + x_k + Z = n$.
- Now $k+1$ variables summing to $n$: $$\binom{n+k}{k}$$
2 Bounded Diophantine — Inclusion-Exclusion Formula
Count non-negative integer solutions of
$$x_1 + x_2 + \cdots + x_k = n, \quad l_i \le x_i \le r_i.$$2.1 Setup
Shift to zero lower-bounds. Let $y_i = x_i - l_i \ge 0$, $U_i = r_i - l_i$, $N = n - \sum l_i$.
- If $N < 0$ or $N > \sum U_i$: answer is $0$.
- Otherwise count: $y_1 + \cdots + y_k = N$, $0 \le y_i \le U_i$.
2.2 Inclusion-Exclusion
Let $A_i$ be the “bad” event $y_i \ge U_i + 1$ (upper bound violated).
Unconstrained count ($y_i \ge 0$, no upper bound): $\binom{N+k-1}{k-1}$.
Forced violation of subset $T$. If every $i \in T$ satisfies $y_i \ge U_i + 1$, substitute $z_i = y_i - (U_i+1) \ge 0$. The new target sum is $N - \sum_{i\in T}(U_i+1)$. Count:
$$\binom{N - \sum_{i\in T}(U_i+1) + k - 1}{k-1}$$where $M = N - \sum_{i\in T}(U_i+1)$; the term is $0$ when $M < 0$.
Inclusion-exclusion over all subsets:
$$\boxed{\#\ =\ \sum_{T \subseteq [k]} (-1)^{|T|} \binom{N - \sum_{i\in T}(U_i+1) + k-1}{k-1}}$$Direct evaluation: $O(k \cdot 2^k)$.
3 Prefix-Sum DP — $O(kN)$
Define $dp[i][s]$ = number of ways to make sum $s$ using the first $i$ variables $y_1, \dots, y_i$ (each in $[0, U_i]$).
Base: $dp[0][0] = 1$.
Transition. $y_i$ contributes any value in $[0, U_i]$, so:
$$dp[i][s] = \sum_{v=0}^{\min(s,U_i)} dp[i-1][s-v] = \mathrm{pref}[s] - \mathrm{pref}[s - U_i - 1]$$where $\mathrm{pref}[t] = \sum_{u=0}^{t} dp[i-1][u]$ and $\mathrm{pref}[-1] = 0$.
Rolling arrays reduce space to $O(N)$.
Answer: $dp[k][N]$ (mod $10^9+7$).
If $N$ itself exceeds available memory or time budget, this DP is not applicable — use the NTT method instead.
#include <bits/stdc++.h>
using namespace std;
const int MOD = 1000000007;
inline int add(int a,int b){ a+=b; if(a>=MOD) a-=MOD; return a; }
int main(){
ios::sync_with_stdio(false); cin.tie(nullptr);
int k; long long n; cin>>k>>n;
vector<long long> L(k),R(k);
for(int i=0;i<k;++i) cin>>L[i]>>R[i];
long long sumL=0,sumU=0;
vector<long long> U64(k);
for(int i=0;i<k;++i){
sumL+=L[i];
long long ui=R[i]-L[i];
if(ui<0){ cout<<0<<"\n"; return 0; }
U64[i]=ui; sumU+=ui;
}
long long N64=n-sumL;
if(N64<0||N64>sumU){ cout<<0<<"\n"; return 0; }
int N=(int)N64;
// clamp each U_i to N — anything larger never affects the DP
vector<int> U(k);
for(int i=0;i<k;++i) U[i]=(int)min(U64[i],N64);
vector<int> prev(N+1,0),cur(N+1,0),pref(N+1,0);
prev[0]=1; int reach=0;
for(int i=0;i<k;++i){
reach=min(N,reach+U[i]);
int run=0;
for(int s=0;s<=reach;++s){ run=add(run,prev[s]); pref[s]=run; }
for(int s=0;s<=reach;++s){
int left=s-U[i]-1;
int val=pref[s];
if(left>=0){ val-=pref[left]; if(val<0) val+=MOD; }
cur[s]=val;
}
for(int s=reach+1;s<=N;++s) cur[s]=0;
swap(prev,cur);
}
cout<<prev[N]<<"\n"; // answer mod 10^9+7
}
4 Divide-and-Conquer NTT — Generating Function Method
Each variable $y_i \in [0, U_i]$ corresponds to the generating polynomial
$$P_i(x) = 1 + x + x^2 + \cdots + x^{U_i}.$$The answer is $[x^N]\displaystyle\prod_{i=1}^k P_i(x)$.
Strategy. Merge the $k$ polynomials with divide-and-conquer; at each merge truncate to degree $N$ to avoid unnecessary growth.
Complexity. Each merge node $v$ performs one convolution of cost $O(d_v \log d_v)$ where $d_v$ is the effective polynomial length at that node (capped at $N+1$). Total cost:
$$O\!\left(\sum_{\text{merge node }v} d_v \log d_v\right)$$Since every truncation keeps $d_v \le N+1$, in the common case where polynomial lengths grow gradually across levels this is approximately $O(N \log N \log k)$. In the worst case (e.g. all $k$ polynomials already of length $\Theta(N)$ at the leaves) it can reach $O(kN \log N)$.
#include <bits/stdc++.h>
using namespace std;
const int MOD = 998244353;
const int G = 3;
long long pw(long long a, long long b, long long mod = MOD) {
long long r = 1; a %= mod;
for (; b > 0; b >>= 1) { if (b & 1) r = r * a % mod; a = a * a % mod; }
return r;
}
void ntt(vector<int>& a, bool inv) {
int n = a.size();
for (int i = 1, j = 0; i < n; ++i) {
int bit = n >> 1;
for (; j & bit; bit >>= 1) j ^= bit;
j ^= bit;
if (i < j) swap(a[i], a[j]);
}
for (int len = 2; len <= n; len <<= 1) {
long long w = inv ? pw(G, MOD - 1 - (MOD - 1) / len)
: pw(G, (MOD - 1) / len);
for (int i = 0; i < n; i += len) {
long long wn = 1;
for (int j = 0; j < len / 2; ++j) {
int u = a[i + j], v = (long long)a[i + j + len/2] * wn % MOD;
a[i + j] = u + v >= MOD ? u + v - MOD : u + v;
a[i + j + len/2] = u - v < 0 ? u - v + MOD : u - v;
wn = wn * w % MOD;
}
}
}
if (inv) {
long long inv_n = pw(n, MOD - 2);
for (auto& x : a) x = (long long)x * inv_n % MOD;
}
}
// multiply A and B, keep only coefficients [0..lim]
vector<int> conv_trunc(vector<int> A, vector<int> B, int lim) {
if (A.empty() || B.empty()) return {};
if ((int)A.size() > lim + 1) A.resize(lim + 1);
if ((int)B.size() > lim + 1) B.resize(lim + 1);
int need = (int)A.size() + (int)B.size() - 1;
int n = 1; while (n < need) n <<= 1;
A.resize(n); B.resize(n);
ntt(A, false); ntt(B, false);
for (int i = 0; i < n; ++i) A[i] = (long long)A[i] * B[i] % MOD;
ntt(A, true);
A.resize(min(need, lim + 1));
return A;
}
// D&C: product of polys[l..r), coefficients truncated to degree N
vector<int> dc(vector<vector<int>>& polys, int l, int r, int N) {
if (r - l == 1) return polys[l];
int m = (l + r) >> 1;
return conv_trunc(dc(polys, l, m, N), dc(polys, m, r, N), N);
}
int main() {
ios::sync_with_stdio(false); cin.tie(nullptr);
int k; long long n; cin >> k >> n;
vector<long long> L(k), R(k);
for (int i = 0; i < k; ++i) cin >> L[i] >> R[i];
long long sumL = 0, sumU = 0;
vector<long long> U64(k);
for (int i = 0; i < k; ++i) {
sumL += L[i];
long long ui = R[i] - L[i];
if (ui < 0) { cout << 0 << "\n"; return 0; }
U64[i] = ui; sumU += ui;
}
long long N64 = n - sumL;
if (N64 < 0 || N64 > sumU) { cout << 0 << "\n"; return 0; }
int N = (int)N64;
// P_i(x) = 1 + x + ... + x^{min(U_i, N)} (higher terms never contribute)
vector<vector<int>> polys(k);
for (int i = 0; i < k; ++i) {
int len = (int)min(U64[i], (long long)N) + 1;
polys[i].assign(len, 1);
}
auto res = dc(polys, 0, k, N);
cout << (N < (int)res.size() ? res[N] : 0) << "\n"; // answer mod 998244353
return 0;
}
5 Equal-Cap Case
$n$ variables each in $[0, k]$, sum $= S$.
Let $A_j$: variable $j$ exceeds $k$ (i.e. $y_j \ge k+1$). By symmetry every size-$i$ intersection has the same count, so:
$$\boxed{g(n,S,k)=\sum_{i=0}^{\min\!\left(n,\,\lfloor S/(k+1)\rfloor\right)}(-1)^i\binom{n}{i}\binom{S-i(k+1)+n-1}{n-1}}$$The upper limit is $\min\!\bigl(n,\lfloor S/(k+1)\rfloor\bigr)$: at most $n$ variables can violate the bound, and terms with $S-i(k+1)<0$ vanish. Complexity:
$$O\!\left(\min\!\left(n,\left\lfloor\frac{S}{k+1}\right\rfloor\right)\right)$$Special case. If $k \ge S$, no upper bound is ever tight: answer $= \dbinom{S+n-1}{n-1}$.
Boundary. If $S < 0$ or $S > nk$, the answer is $0$.
6 Example — CF 2127F
Combinatorics + Inclusion-Exclusion + Hockey-Stick Identity
6.1 Linearise the Sum
After substituting the sum variable, both $A(M)$ and $B(M)$ below reduce to sums of the form
$$\mathrm{Cnt}(k,\,y,\,M)=\#\bigl\{k\text{ variables in }[0,M]\text{ summing to }y\bigr\}$$weighted by $y$ (or shifted versions thereof). Concretely:
- Let $k_1=n-2$, window $[L_A,U_A]=[\max(0,S-(M-1)),\,\min(S,k_1 M)]$, $$A(M)=\sum_{y=L_A}^{U_A}(y+M-S)\cdot\mathrm{Cnt}(k_1,y,M).$$
- Let $k_2=n-3$, window $[L_B,U_B]=[\max(0,S-2M+1),\,\min(S-M,k_2 M)]$, $$B(M)=\sum_{y=L_B}^{U_B}(y+2M-S)\cdot\mathrm{Cnt}(k_2,y,M).$$
6.2 Problem Statement
Given integers $n$ and $m$. A snake array $a_1,\dots,a_n$ satisfies:
- $0\le a_i\le m$;
- $\sum a_i = m$;
- $a_n = \max(a_1,\dots,a_n)$.
Define function $f(a)$ as follows (pseudocode): starting at $\mathrm{pos}=1$, if $a_{\mathrm{pos}}\lt a_n$ jump to the first position to the right with a strictly larger value and add the difference to the answer; otherwise increment $\mathrm{pos}$. Compute the sum of $f(a)$ over all snake arrays, modulo $10^9+7$.
6.3 Algorithm
6.3.1 Simplify $f(a)$
Let $M=a_n$. Tracing through the process: whenever we are at some position $i$ with $a_i \lt M$, we keep jumping to the “next greater” until we reach $a_n$, accumulating $M - a_i$.
Which positions $i$ serve as “segment heads”? Exactly those satisfying:
- $i=1$ and $a_1 \lt M$, or
- $i\ge 2$, $a_{i-1}=M$, and $a_i \lt M$.
This gives the clean formula:
$$f(a)=\sum_{i=1}^{n-1}(M-a_i)\cdot\bigl[a_i\lt M\;\wedge\;(i=1\text{ or }a_{i-1}=M)\bigr].$$6.3.2 Enumerate the Maximum $M$
The snake-array conditions are equivalent to: fix $M\in[0,m]$ as the maximum, let the first $n-1$ entries satisfy $0\le a_i\le M$ and $\sum_{i=1}^{n-1}a_i = S:=m-M$, and set $a_n=M$.
For fixed $M$, linearity lets us split the contribution into two types:
- Position $i=1$: $$A(M)=\sum_{x=0}^{M-1}(M-x)\cdot\#\bigl\{(a_2,\dots,a_{n-1})\in[0,M]^{n-2}:\textstyle\sum a=S-x\bigr\}.$$
- Position $i\ge2$ (requires $a_{i-1}=M$): $$B(M)=\sum_{x=0}^{M-1}(M-x)\cdot\#\bigl\{n-3\text{ variables in }[0,M]:\textstyle\sum a=S-M-x\bigr\}.$$
By symmetry every $i\in\{2,\dots,n-1\}$ contributes equally, so the total contribution from these positions is $(n-2)\cdot B(M)$.
The answer is therefore:
$$\mathrm{Ans}=\sum_{M=0}^{m}\bigl(A(M)+(n-2)\,B(M)\bigr).$$6.3.3 Count $t$ Ordered Integers in $[0,k]$ Summing to $S$
1. Unconstrained (stars and bars). If $x_i\ge0$ with no upper bound, the count is $\binom{S+t-1}{t-1}$.
2. Remove the upper bound by inclusion-exclusion.
$$\#\text{valid}=\sum_{j=0}^{t}(-1)^j\binom{t}{j}\cdot\#\bigl\{\text{solutions with the chosen }j\text{ variables}\ge k+1\bigr\}.$$Subtract $k+1$ from each chosen variable; the remaining $t$ variables are non-negative with total $S-j(k+1)$. This gives the standard formula:
$$\boxed{\mathrm{Cnt}(t,S,k)=\begin{cases}\displaystyle\sum_{j=0}^{\min(t,\,\lfloor S/(k+1)\rfloor)}(-1)^j\binom{t}{j}\binom{S-j(k+1)+t-1}{t-1}, & t\ge1,\\[6pt]1 & (t=0\text{ and }S=0),\\[4pt]0 & (t=0\text{ and }S\ne0).\end{cases}}$$Binomial terms with a negative lower index are treated as 0. For each evaluation the upper limit on $j$ is $\lfloor S/(k+1)\rfloor$, so the cost per call is $O(S/k)$.
Expanding $\mathrm{Cnt}$ and substituting into $A(M)$:
$$A(M)=\sum_{x=0}^{M-1}(M-x)\sum_{j\ge0}(-1)^j\binom{k}{j}\binom{S-x-j(M+1)+r}{r}.$$Why the naive triple loop is too slow.
- Outer loop: $M=0,\dots,m$.
- Middle loop (per $M$): $x=0,\dots,\min(M-1,S)$ — roughly $O(M)$ iterations.
- Inner loop (per $(M,x)$): inclusion-exclusion over $j=0,\dots,\lfloor(S-x)/(M+1)\rfloor$.
For fixed $M$, the middle × inner work is approximately $O\!\left(M\cdot\frac{S}{M+1}\right)$. When $S\approx m$ this is $O(m)$ per $M$, making the total $O(m^2)$ — too slow for $m\le2\cdot10^5$.
6.4 Interchange the Summations
Because both sums are finite (the inner $j$-sum terminates when $S-x-j(M+1)\lt0$), we may swap the order of summation:
$$A(M)=\sum_{j\ge0}(-1)^j\binom{k}{j}\sum_{x=0}^{M-1}(M-x)\binom{S-x-j(M+1)+r}{r}.$$(Apply the same transformation to $B(M)$, replacing $k$ with $n-3$ and $S$ with $S-M$.)
6.5 Substitute Variables — Convert to an Interval Sum over $t$
Fix $j$ and write the inner sum as:
$$I_j:=\sum_{x=0}^{M-1}(M-x)\binom{S-x-j(M+1)+r}{r}.$$Substitute $t := S-j(M+1)-x$, i.e. $x = S-j(M+1)-t$. As $x$ runs from $0$ to $M-1$, $t$ runs from $S-j(M+1)$ down to $S-j(M+1)-M+1$. Truncating to the non-negative part:
$$a:=\max\bigl(0,\;S-j(M+1)-(M-1)\bigr),\qquad b:=S-j(M+1).$$If $b\lt0$ the term is 0 (use this as an early-exit condition when increasing $j$).
After substitution, $M-x = (M-S+j(M+1))+t$, so:
$$I_j=\sum_{t=a}^{b}\bigl((M-S)+j(M+1)+t\bigr)\binom{t+r}{r}.$$Splitting the constant from the $t$-term:
$$I_j=\bigl(M-S+j(M+1)\bigr)\underbrace{\sum_{t=a}^{b}\binom{t+r}{r}}_{T_0(a,b)}+\underbrace{\sum_{t=a}^{b}t\binom{t+r}{r}}_{T_1(a,b)}.$$6.6 Evaluate $T_0$ and $T_1$ via Hockey Stick
Let $r=k-1$.
$T_0$ (Hockey Stick):
$$T_0(a,b)=\sum_{t=a}^{b}\binom{t+r}{r}=\binom{b+r+1}{r+1}-\binom{a+r}{r+1}.$$($T_0=0$ if $a>b$; the second term vanishes when $a=0$.)
$T_1$ — use the identity $t\binom{t+r}{r}=(r+1)\binom{t+r}{r+1}$:
$$T_1(a,b)=\sum_{t=a}^{b}t\binom{t+r}{r}=(r+1)\sum_{t=a}^{b}\binom{t+r}{r+1}=(r+1)\!\left(\binom{b+r+1}{r+2}-\binom{a+r}{r+2}\right).$$Therefore:
$$I_j=\bigl(M-S+j(M+1)\bigr)\,T_0(a,b)+T_1(a,b),$$and substituting back:
$$A(M)=\sum_{j\ge0}(-1)^j\binom{k}{j}\Bigl(\bigl(M-S+j(M+1)\bigr)T_0(a,b)+T_1(a,b)\Bigr).$$Each $T_0,T_1$ is a difference of two binomial coefficients — $O(1)$ per term.
6.7 Same Treatment for $B(M)$
Apply the identical steps to
$$B(M)=\sum_{x=0}^{M-1}(M-x)\cdot\mathrm{Cnt}(k',\,S-M-x,\,M),\qquad k'=n-3,$$using $S\leftarrow S-M$ throughout. The only change is the constant factor in $M-x$: it becomes $(2M-S)+j(M+1)$.
6.8 Key Takeaways
- Linearise complex processes into “segment-head” contributions — reduces a complicated traversal to a simple weighted sum.
- Swap summation order when both sums are finite — turns a triple loop into $O(1)$ per $(M,j)$ pair after precomputing binomials.
- Hockey Stick + the identity $t\binom{t+r}{r}=(r+1)\binom{t+r}{r+1}$ — closes interval sums of the form $\sum t^{\le1}\binom{t+r}{r}$ in $O(1)$.
#include <bits/stdc++.h>
using namespace std;
using i64 = long long;
const int MOD = 1'000'000'007;
i64 qp(i64 a, i64 e = MOD - 2) {
i64 r = 1;
while (e) {
if (e & 1) r = r * a % MOD;
a = a * a % MOD; e >>= 1;
}
return r;
}
int main() {
ios::sync_with_stdio(false);
cin.tie(nullptr);
int T; cin >> T;
vector<pair<int,int>> qs(T);
int maxn = 0, maxm = 0;
for (int t = 0; t < T; ++t) {
int n, m; cin >> n >> m;
qs[t] = {n, m};
maxn = max(maxn, n);
maxm = max(maxm, m);
}
int LIM = maxn + maxm + 5;
vector<i64> fac(LIM + 1), ifac(LIM + 1);
fac[0] = 1;
for (int i = 1; i <= LIM; ++i) fac[i] = fac[i-1] * i % MOD;
ifac[LIM] = qp(fac[LIM]);
for (int i = LIM; i >= 1; --i) ifac[i-1] = ifac[i] * i % MOD;
auto C = [&](i64 n, i64 k) -> i64 {
if (n < 0 || k < 0 || k > n) return 0;
if (n > LIM) return 0;
return fac[n] * ifac[k] % MOD * ifac[n-k] % MOD;
};
// Returns (sum0, sum1) = ( sum Cnt(k,y;M), sum y*Cnt(k,y;M) ) for y in [L..U]
// where Cnt(k,y;M) = #{k variables in [0,M] summing to y}
auto windowed = [&](int k, int M, i64 L, i64 U) -> pair<i64,i64> {
if (L > U || M < 0) return {0, 0};
if (k == 0) {
i64 s0 = (0 >= L && 0 <= U) ? 1 : 0;
return {s0, 0};
}
i64 sum0 = 0, sum1 = 0;
int r = k - 1;
if (U < 0) return {0, 0};
i64 jmax = U / (M + 1);
for (i64 j = 0; j <= jmax; ++j) {
i64 a = L - j * (M + 1);
i64 b = U - j * (M + 1);
if (b < 0) break;
if (a < 0) a = 0;
if (a > b) continue;
// T0 = C(b+r+1, r+1) - C(a-1+r+1, r+1)
i64 s0 = (C(b + r + 1, r + 1) - C(a - 1 + r + 1, r + 1)) % MOD;
if (s0 < 0) s0 += MOD;
// T1 = (r+1) * (C(b+r+1, r+2) - C(a-1+r+1, r+2))
i64 s1t = (i64)(r + 1) * ((C(b + r + 1, r + 2) - C(a - 1 + r + 1, r + 2)) % MOD) % MOD;
if (s1t < 0) s1t += MOD;
i64 coef = C(k, j);
if (j & 1) coef = (MOD - coef) % MOD;
// y = t + j*(M+1), so sum1 gets s1t + j*(M+1)*s0
i64 part = (s1t + (j % MOD) * ((M + 1) % MOD) % MOD * s0) % MOD;
sum0 = (sum0 + coef * s0) % MOD;
sum1 = (sum1 + coef * part) % MOD;
}
return {sum0, sum1};
};
for (auto [n, m] : qs) {
i64 ans = 0;
for (int M = 0; M <= m; ++M) {
int S = m - M;
// A(M): k1 = n-2, window [max(0,S-(M-1)) .. min(S, k1*M)]
i64 A = 0;
int k1 = n - 2;
if (M > 0) {
i64 LA = max(0, S - (M - 1));
i64 UA = (k1 >= 1) ? min<i64>(S, 1LL * k1 * M) : min<i64>(S, 0);
auto [s0A, s1A] = windowed(k1, M, LA, UA);
A = (s1A + ((M - S + 0LL) % MOD + MOD) % MOD * s0A) % MOD;
}
// B(M): k2 = n-3, window [max(0,S-2M+1) .. min(S-M, k2*M)]
i64 B = 0;
int k2 = n - 3;
if (M > 0 && k2 >= 0) {
i64 LB = max(0, S - 2 * M + 1);
i64 UB = (k2 >= 1) ? min<i64>(S - M, 1LL * k2 * M) : min<i64>(S - M, 0);
if (LB <= UB) {
auto [s0B, s1B] = windowed(k2, M, LB, UB);
B = (s1B + ((2LL * M - S) % MOD + MOD) % MOD * s0B) % MOD;
}
}
ans = (ans + A + ((n - 2 + 0LL) % MOD) * B) % MOD;
}
cout << ans % MOD << "\n";
}
return 0;
}