1 Matrix Multiplication & Fast Power
1.1 Matrix Multiplication
For two $n\times n$ matrices $A=(a_{ij})$ and $B=(b_{ij})$, their product $C=AB$ satisfies:
$$c_{ij}=\sum_{k=1}^{n}a_{ik}b_{kj}$$Power definition: $A^1=A$, $A^n=A^{n-1}\cdot A$.
1.2 Matrix Fast Power for Linear Recurrences
The key technique: encode a linear recurrence as a state vector multiplied by a constant transition matrix, then use fast exponentiation to reach $f(n)$ in $O(k^3\log n)$.
Pattern 1: Basic $k$-order recurrence
$$f(n)=a_1 f(n-1)+a_2 f(n-2)+\cdots+a_k f(n-k)$$$$\begin{pmatrix}f(n)\\f(n-1)\\\vdots\\f(n-k+1)\end{pmatrix} =\begin{pmatrix}a_1&a_2&\cdots&a_{k-1}&a_k\\1&0&\cdots&0&0\\0&1&\cdots&0&0\\\vdots&&\ddots&&\vdots\\0&0&\cdots&1&0\end{pmatrix} \begin{pmatrix}f(n-1)\\f(n-2)\\\vdots\\f(n-k)\end{pmatrix}$$Pattern 2: Recurrence with constant term
$$f(n)=a_1 f(n-1)+a_2 f(n-2)+C$$$$\begin{pmatrix}f(n)\\f(n-1)\\1\end{pmatrix} =\begin{pmatrix}a_1&a_2&C\\1&0&0\\0&0&1\end{pmatrix} \begin{pmatrix}f(n-1)\\f(n-2)\\1\end{pmatrix}$$Pattern 3: Recurrence with polynomial forcing term
$$f(n)=a_1 f(n-1)+a_2 f(n-2)+c_2 n^2+c_1 n+c_0$$Augment the state with $(n+1)^2$, $n+1$, $1$ and use $(n+1)^2=n^2+2n+1$:
$$\begin{pmatrix}f(n)\\f(n-1)\\(n+1)^2\\n+1\\1\end{pmatrix} =\begin{pmatrix}a_1&a_2&c_2&c_1&c_0\\1&0&0&0&0\\0&0&1&2&1\\0&0&0&1&1\\0&0&0&0&1\end{pmatrix} \begin{pmatrix}f(n-1)\\f(n-2)\\n^2\\n\\1\end{pmatrix}$$Pattern 4: Mutually dependent recurrences
$$f(n)=a_{11}f(n-1)+a_{12}g(n-1),\quad g(n)=a_{21}f(n-1)+a_{22}g(n-1)$$$$\begin{pmatrix}f(n)\\g(n)\end{pmatrix} =\begin{pmatrix}a_{11}&a_{12}\\a_{21}&a_{22}\end{pmatrix} \begin{pmatrix}f(n-1)\\g(n-1)\end{pmatrix}$$Summary of matrix fast power applications:
- Linear recurrences (the core use case)
- Matrix fast power requires a constant matrix — the transition matrix must not change with $n$
- Directed graph path counting: $(G^n)_{ij}$ = number of paths from $i$ to $j$ of exactly $n$ steps
Example: CF 1117D — Valid 01 Strings
Count length-$N$ binary strings where every maximal all-zero run has length divisible by $M$. $(1\leq N\leq10^{18},\ 2\leq M\leq100)$
Observation: valid strings are built from two operations — append a “1”, or append a zero block of length exactly $M$.
$$f(n)=f(n-1)+f(n-M)\quad(f(0)=f(1)=\cdots=f(M-1)=1)$$Matrix model: define the state vector $\mathbf{F}(n)=[f(n),f(n-1),\ldots,f(n-M+1)]^\top$, and the $M\times M$ transition matrix:
$$A=\begin{bmatrix}1&0&\cdots&0&1\\1&0&\cdots&0&0\\0&1&\cdots&0&0\\\vdots&&\ddots&&\vdots\\0&0&\cdots&1&0\end{bmatrix}$$(First row encodes $f(n+1)=f(n)+f(n+1-M)$; remaining rows shift the state down.)
Initial vector: $\mathbf{F}(M-1)=[1,1,\ldots,1]^\top$.
Answer: $f(N)=(\mathbf{F}(N))_1=A^{N-(M-1)}\mathbf{F}(M-1)$ at index 0.
Complexity: $O(M^3\log N)$.
int main(){
ll n; int m;
cin >> n >> m;
if (n < m) { cout << 1 << '\n'; return 0; }
if (n == m) { cout << 2 << '\n'; return 0; }
Mat A; A.assign(m, vector<int>(m));
A[0][0] = 1;
A[0][m-1] = 1;
for(int i = 1; i < m; i++) A[i][i-1] = 1;
vector<int> F(m, 1); // initial vector F(m-1) = [1,1,...,1]^T
auto jm = mpow_vec(A, n-(m-1), F);
cout << jm[0] << '\n';
return 0;
}
2 Matrix Fast Power for DP — Four Semiring Types
Type ①: Counting (standard ring, mod $M$)
Semantics: number of paths/arrangements.
Semiring: $(\mathbb{Z}_M, +, \times, 0, 1)$
#include <bits/stdc++.h>
using namespace std;
using ll = long long;
const int M = 1000000007;
using Mat = vector<vector<int>>;
Mat id(int n){
Mat I(n, vector<int>(n, 0));
for(int i = 0; i < n; i++) I[i][i] = 1;
return I;
}
Mat mul(const Mat& A, const Mat& B){
int n = (int)A.size();
Mat C(n, vector<int>(n, 0));
for(int i = 0; i < n; i++){
for(int k = 0; k < n; k++){
int aik = A[i][k];
if(!aik) continue;
ll t = aik;
for(int j = 0; j < n; j++)
C[i][j] = (C[i][j] + t * B[k][j]) % M;
}
}
return C;
}
Mat mpow(Mat A, unsigned long long e){
int n = (int)A.size();
Mat R = id(n);
while(e){
if(e & 1ULL) R = mul(R, A);
A = mul(A, A);
e >>= 1ULL;
}
return R;
}
// compute A * x (matrix-vector)
vector<int> mul_vec(const Mat& A, const vector<int>& x){
int n = (int)A.size();
vector<int> y(n, 0);
for(int i = 0; i < n; i++){
long long s = 0;
for(int j = 0; j < n; j++)
if(A[i][j]) s = (s + 1LL * A[i][j] * x[j]) % M;
y[i] = (int)s;
}
return y;
}
// compute A^e * x0 (O(n^2 log e), saves one dimension vs. full matrix power)
vector<int> mpow_vec(Mat A, unsigned long long e, vector<int> x){
while(e){
if(e & 1ULL) x = mul_vec(A, x);
A = mul(A, A);
e >>= 1ULL;
}
return x;
}
Example: Nowcoder 17890 — Grid Coloring
$m\times n$ grid; each cell is black (1) or white (0). Constraints: no two horizontally adjacent cells are both white; no two adjacent columns are both all-black. Count valid colourings. $(1\leq m\leq5,\ 1\leq n\leq10^{18})$
Bitmask DP: encode each column as an $m$-bit mask (bit=1 → black, bit=0 → white).
Constraint 1 (no two adjacent white cells in same row): for adjacent columns st' (prev) and st (curr), every row has at least one black cell → (st | st') == ALL where ALL = (1<<m)-1.
Constraint 2 (no two adjacent all-black columns): not (st==ALL && st'==ALL).
Transition matrix $A$ of size $2^m\times2^m$:
$$A[\text{st}][\text{st}'] = 1 \iff (\text{st}\mid\text{st}')==\text{ALL} \;\text{ and }\; \neg(\text{st}=\text{st}'=\text{ALL})$$(Visual: the matrix rows are indexed by current column mask st, columns by previous mask st’. Entry is 1 iff both constraints hold.)
Complexity: $O((2^m)^3\log n)$.
void solve() {
int m; unsigned long long n;
cin >> m >> n;
int S = 1 << m, ALL = S - 1;
Mat A; A.assign(S, vector<int>(S));
for(int stp = 0; stp < S; ++stp)
for(int st = 0; st < S; ++st)
if(((st | stp) == ALL) && !(st == ALL && stp == ALL))
A[st][stp] = 1;
vector<int> x0(S, 1);
auto jm = mpow_vec(A, n-1, x0);
ll ans = 0;
for(int i = 0; i < S; i++){ ans += jm[i]; ans %= M; }
cout << ans << endl;
}
Example: LeetCode — Number of ZigZag Arrays II
Array of length $n$, elements in $[l,r]$; adjacent elements unequal; no three consecutive elements strictly increasing or decreasing. Count valid arrays mod $10^9+7$. ($3\leq n\leq10^9$, $1\leq l\lt r\leq75$)
Reduction: shift values to $\{1,\ldots,K\}$ with $K=r-l+1$ (only length matters). Valid arrays are one of two alternating patterns: up-down or down-up.
With $t=n-1$ steps, define two $K\times K$ matrices:
$$U_{i,j}=[i>j]\quad(\text{go up: current }\gt\text{ previous})$$$$D_{i,j}=[i\lt j]\quad(\text{go down: current }\lt\text{ previous})$$For the “up-start” pattern:
- $t$ even: operator $(DU)^{t/2}$
- $t$ odd: operator $U\cdot(DU)^{\lfloor t/2\rfloor}$
For the “down-start” pattern: swap $U$ and $D$.
Apply each operator to the all-ones initial vector $x_0$, sum all components.
Complexity: $O(K^3\log n)$ where $K\leq75$.
Type ②: Boolean Reachability (OR-AND semiring)
Semantics: does there exist a path/sequence of length exactly $n$?
Semiring: $(\{0,1\},\lor,\land,0,1)$
Note: computing $A^k$ with standard integer arithmetic and thresholding at $>0$ gives reachability too — but the boolean version avoids overflow.
Useful identity: $\leq k$ step reachability via $(I\lor A)^k = I\lor A\lor A^2\lor\cdots\lor A^k$ (OR is idempotent, so boolean fast power works directly).
Up to 64 states — bitmask encoding, $O(n^3/64)$
using ull = unsigned long long;
struct BMat {
int n;
vector<ull> r; // r[i] is a bitmask: bit j = A[i][j]
BMat(): n(0) {}
explicit BMat(int n_): n(n_), r(n_, 0ULL) {}
};
BMat bid(int n){
assert(n <= 64);
BMat I(n);
for(int i = 0; i < n; i++) I.r[i] |= (1ULL << i);
return I;
}
// C = A (*) B (boolean: C[i][j] = OR_k (A[i][k] AND B[k][j]))
// O(n^3 / 64) via 64-bit AND
BMat bmul(const BMat& A, const BMat& B){
assert(A.n == B.n);
int n = A.n;
BMat C(n);
vector<ull> col(n, 0ULL);
for(int j = 0; j < n; j++){
ull m = 0ULL;
for(int k = 0; k < n; k++)
if((B.r[k] >> j) & 1ULL) m |= (1ULL << k);
col[j] = m;
}
for(int i = 0; i < n; i++){
const ull ai = A.r[i];
if(!ai) continue;
for(int j = 0; j < n; j++)
if(ai & col[j]) C.r[i] |= (1ULL << j);
}
return C;
}
BMat bmpow(BMat A, ull e){
BMat R = bid(A.n);
while(e){ if(e & 1ULL) R = bmul(R, A); A = bmul(A, A); e >>= 1ULL; }
return R;
}
ull bmul_vec(const BMat& A, ull x){
int n = A.n; ull y = 0ULL;
for(int i = 0; i < n; i++)
if(A.r[i] & x) y |= (1ULL << i);
return y;
}
ull bmpow_vec(BMat A, ull e, ull x){
while(e){ if(e & 1ULL) x = bmul_vec(A, x); A = bmul(A, A); e >>= 1ULL; }
return x;
}
void bset(BMat& A, int i, int j){ A.r[i] |= (1ULL << j); }
bool bget(const BMat& A, int i, int j){ return (A.r[i] >> j) & 1ULL; }
Arbitrary $n$ — blocked bitmask, $O(n^3/64)$
using ull = unsigned long long;
struct BM {
int n, b; // b = ceil(n/64) blocks
ull mask; // valid bits in the last block
vector<vector<ull>> a; // a[i][t]: block t of row i
BM(): n(0), b(0), mask(0) {}
explicit BM(int n_): n(n_) {
b = (n + 63) >> 6;
a.assign(n, vector<ull>(b, 0ULL));
int r = n & 63;
mask = (b == 0 ? 0ULL : (r ? ((1ULL<<r)-1ULL) : ~0ULL));
}
static BM id(int n){
BM I(n);
for(int i = 0; i < n; ++i) I.a[i][i>>6] |= (1ULL << (i & 63));
if(I.b) for(int i = 0; i < n; ++i) I.a[i].back() &= I.mask;
return I;
}
inline void set(int i, int j){ a[i][j>>6] |= (1ULL << (j & 63)); }
inline bool get(int i, int j) const { return (a[i][j>>6] >> (j & 63)) & 1; }
};
BM bmul(const BM& A, const BM& B){
int n = A.n, b = A.b;
BM C(n);
for(int i = 0; i < n; ++i){
auto &row = C.a[i];
for(int t = 0; t < b; ++t){
ull w = A.a[i][t];
while(w){
int bit = __builtin_ctzll(w);
int k = (t<<6) + bit;
if(k < n) for(int u = 0; u < b; ++u) row[u] |= B.a[k][u];
w &= w - 1;
}
}
if(b) row[b-1] &= C.mask;
}
return C;
}
BM bmpow(BM A, unsigned long long e){
BM R = BM::id(A.n);
while(e){ if(e & 1ULL) R = bmul(R, A); A = bmul(A, A); e >>= 1ULL; }
return R;
}
// y = A * x (column-vector semantics: predecessors of x in one step)
std::vector<ull> bmul_vec(const BM& A, const std::vector<ull>& x){
int n = A.n, b = A.b;
std::vector<ull> y(b, 0ULL);
for(int i = 0; i < n; ++i){
bool hit = false;
for(int t = 0; t < b; ++t) if(A.a[i][t] & x[t]){ hit = true; break; }
if(hit) y[i>>6] |= (1ULL << (i & 63));
}
if(b) y[b-1] &= A.mask;
return y;
}
std::vector<ull> bmpow_vec(BM A, unsigned long long e, std::vector<ull> x){
while(e){ if(e & 1ULL) x = bmul_vec(A, x); A = bmul(A, A); e >>= 1ULL; }
return x;
}
// forward step: y = x * A (expand from source set outward)
std::vector<ull> bmul_vec_forward(const BM& A, const std::vector<ull>& x){
int b = A.b, n = A.n;
std::vector<ull> y(b, 0ULL);
for(int i = 0; i < n; ++i)
if((x[i>>6] >> (i & 63)) & 1ULL)
for(int u = 0; u < b; ++u) y[u] |= A.a[i][u];
if(b) y[b-1] &= A.mask;
return y;
}
Type ③: Shortest Cost (min-plus semiring)
Semantics: minimum total cost of exactly $n$ steps.
Semiring: $(\mathbb{L}\cup\{\infty\},\min,+,\infty,0)$
#include <bits/stdc++.h>
using namespace std;
using ll = long long;
using Mat = vector<vector<ll>>;
const ll INF = (1LL<<60);
Mat minplus_mul(const Mat& A, const Mat& B){
int n = (int)A.size();
Mat C(n, vector<ll>(n, INF));
for(int i = 0; i < n; i++)
for(int k = 0; k < n; k++) if(A[i][k] < INF){
ll aik = A[i][k];
for(int j = 0; j < n; j++) if(B[k][j] < INF)
C[i][j] = min(C[i][j], aik + B[k][j]);
}
return C;
}
Mat minplus_pow(Mat A, unsigned long long e){
int n = (int)A.size();
Mat R(n, vector<ll>(n, INF));
for(int i = 0; i < n; i++) R[i][i] = 0; // identity: diagonal 0, rest INF
while(e){
if(e & 1ULL) R = minplus_mul(R, A);
A = minplus_mul(A, A);
e >>= 1ULL;
}
return R;
}
// A^n * v (O(n^2 log n), faster than full matrix power for vector queries)
vector<ll> minplus_pow_vec(Mat A, unsigned long long e, vector<ll> v){
int n = (int)A.size();
auto apply = [&](const Mat& M, const vector<ll>& x){
vector<ll> y(n, INF);
for(int i = 0; i < n; i++)
for(int k = 0; k < n; k++) if(M[i][k] < INF && x[k] < INF)
y[i] = min(y[i], M[i][k] + x[k]);
return y;
};
while(e){
if(e & 1ULL) v = apply(A, v);
A = minplus_mul(A, A);
e >>= 1ULL;
}
return v;
}
Type ④: Maximum Reward (max-plus semiring)
Semantics: maximum total reward of exactly $n$ steps.
Semiring: $(\mathbb{L}\cup\{-\infty\},\max,+,-\infty,0)$
using Mat = vector<vector<ll>>;
const ll NINF = -(1LL<<60);
Mat maxplus_mul(const Mat& A, const Mat& B){
int n = (int)A.size();
Mat C(n, vector<ll>(n, NINF));
for(int i = 0; i < n; i++)
for(int k = 0; k < n; k++) if(A[i][k] > NINF){
ll aik = A[i][k];
for(int j = 0; j < n; j++) if(B[k][j] > NINF)
C[i][j] = max(C[i][j], aik + B[k][j]);
}
return C;
}
Mat maxplus_pow(Mat A, unsigned long long e){
int n = (int)A.size();
Mat R(n, vector<ll>(n, NINF));
for(int i = 0; i < n; i++) R[i][i] = 0; // identity: diagonal 0, rest -INF
while(e){
if(e & 1ULL) R = maxplus_mul(R, A);
A = maxplus_mul(A, A);
e >>= 1ULL;
}
return R;
}
vector<ll> maxplus_pow_vec(Mat A, unsigned long long e, vector<ll> v){
int n = (int)A.size();
auto apply = [&](const Mat& M, const vector<ll>& x){
vector<ll> y(n, NINF);
for(int i = 0; i < n; i++)
for(int k = 0; k < n; k++) if(M[i][k] > NINF && x[k] > NINF)
y[i] = max(y[i], M[i][k] + x[k]);
return y;
};
while(e){
if(e & 1ULL) v = apply(A, v);
A = maxplus_mul(A, A);
e >>= 1ULL;
}
return v;
}
3 Matrix + Data Structures
3.1 Fibonacci Segment Tree (CF 719E — Sasha and Array)
Array $a_1,\ldots,a_n$, $m$ queries:
- Range add $x$ to $a[l..r]$
- Query $\sum_{i=l}^r F(a_i)\bmod(10^9+7)$ where $F$ is the Fibonacci function.
$(1\leq n,m\leq10^5,\ 1\leq a_i,x\leq10^9)$
Key insight: define the Fibonacci transition matrix $T=\begin{pmatrix}1&1\\1&0\end{pmatrix}$ with $T^k=\begin{pmatrix}F_{k+1}&F_k\\F_k&F_{k-1}\end{pmatrix}$.
Represent each $a_i$ as the $2\times2$ matrix $T^{a_i}$. Adding $x$ to $a_i$ means right-multiplying by $T^x$, which is the same matrix for all positions in the range.
Segment tree with matrix lazy tags:
- Node value
sum: sum of all $T^{a_i}$ in the range (matrix addition, element-wise). - Lazy tag
tag: pending right-multiplication matrix (identity initially). - push_down: $\sum T^{a_i+x} = \sum T^{a_i} \cdot T^x$, so multiply both
sumandtagof children on the right. - push_up:
sum = left.sum + right.sum(element-wise matrix addition). - Answer: query returns the sum matrix; answer is
sum.a[1][0](= $\sum F_{a_i}$).
Complexity: $O(m\log n\cdot2^3\log x)$ since each T^x costs $O(4\log x)$.
#include<bits/stdc++.h>
#define int long long
#define endl '\n'
using namespace std;
const int N = 2e5 + 10;
const int MOD = 1e9+7;
const int SZ = 2; // matrix dimension
struct Mat {
int a[SZ][SZ];
Mat(){ for(int i=0;i<SZ;i++) for(int j=0;j<SZ;j++) a[i][j]=0; }
void clear(){ for(int i=0;i<SZ;i++) for(int j=0;j<SZ;j++) a[i][j]=0; }
void init(){ clear(); for(int i=0;i<SZ;i++) a[i][i]=1; }
Mat operator+(const Mat& dx) const {
Mat ls;
for(int i=0;i<SZ;i++) for(int j=0;j<SZ;j++)
ls.a[i][j] = (a[i][j]+dx.a[i][j])%MOD;
return ls;
}
Mat operator*(const Mat& dx) const {
Mat ls;
for(int i=0;i<SZ;i++) for(int j=0;j<SZ;j++){
long long s=0;
for(int k=0;k<SZ;k++){
s += (a[i][k]*dx.a[k][j])%MOD;
if(s>=(1ll<<62)) s%=MOD;
}
ls.a[i][j]=(int)(s%MOD);
}
return ls;
}
Mat mpow(long long e) const {
Mat ls; ls.init(); Mat di=*this;
while(e){ if(e&1) ls=ls*di; e>>=1; di=di*di; }
return ls;
}
} tree[N<<2], lazy[N<<2];
int a[N];
inline Mat baseT(){ // Fibonacci transition matrix
Mat t;
t.a[0][0]=1; t.a[0][1]=1;
t.a[1][0]=1; t.a[1][1]=0;
return t;
}
void pushdown(int dq){
lazy[dq<<1] = lazy[dq<<1] * lazy[dq];
lazy[dq<<1|1] = lazy[dq<<1|1] * lazy[dq];
tree[dq<<1] = tree[dq<<1] * lazy[dq];
tree[dq<<1|1] = tree[dq<<1|1] * lazy[dq];
lazy[dq].init();
}
void pushup(int dq){ tree[dq]=tree[dq<<1]+tree[dq<<1|1]; }
void build(int l,int r,int dq){
lazy[dq].init();
if(l==r){ Mat T=baseT(); tree[dq]=T.mpow(a[l]); return; }
int m=(l+r)>>1;
build(l,m,dq<<1); build(m+1,r,dq<<1|1); pushup(dq);
}
void add(int l,int r,int ml,int mr,int dq,const Mat& zhi){
if(ml<=l && r<=mr){ lazy[dq]=lazy[dq]*zhi; tree[dq]=tree[dq]*zhi; return; }
pushdown(dq); int m=(l+r)>>1;
if(ml<=m) add(l,m,ml,mr,dq<<1,zhi);
if(mr>m) add(m+1,r,ml,mr,dq<<1|1,zhi);
pushup(dq);
}
Mat query(int l,int r,int ml,int mr,int dq){
if(ml<=l && r<=mr) return tree[dq];
pushdown(dq); int m=(l+r)>>1; Mat ls;
if(ml<=m) ls=ls+query(l,m,ml,mr,dq<<1);
if(mr>m) ls=ls+query(m+1,r,ml,mr,dq<<1|1);
return ls;
}
void solve(){
int n,m; cin>>n>>m;
for(int i=1;i<=n;i++) cin>>a[i];
build(1,n,1);
int sz,l,r; long long v; Mat T=baseT();
while(m--){
cin>>sz;
if(sz==1){ cin>>l>>r>>v; Mat M=T.mpow(v); add(1,n,l,r,1,M); }
else{ cin>>l>>r; Mat ans=query(1,n,l,r,1); cout<<(ans.a[1][0]%MOD+MOD)%MOD<<endl; }
}
}
signed main(){ ios::sync_with_stdio(false); cin.tie(0); cout.tie(0); solve(); return 0; }
3.2 Range-Add, Range sin-Sum — Luogu P6327 (Floating Point)
Array $a[]$, operations: range add $v$; query $\sum_{i=l}^r\sin(a_i)$. ($n,m\leq2\times10^5$)
Key identity: $\sin(a+v)=\sin a\cos v+\cos a\sin v$ and $\cos(a+v)=-\sin a\sin v+\cos a\cos v$.
Store each node as the $2\times2$ rotation matrix:
$$\begin{pmatrix}\cos a_i&-\sin a_i\\\sin a_i&\cos a_i\end{pmatrix}$$Adding $v$ to $a_i$ is right-multiplication by the rotation matrix $R(v)=\begin{pmatrix}\cos v&-\sin v\\\sin v&\cos v\end{pmatrix}$.
The segment tree stores matrix sums; the answer to a range-sin query is ans.a[1][0] (sum of $\sin$ components). The push-down and push-up are identical in structure to the Fibonacci case but with floating-point matrices.
#include<bits/stdc++.h>
#define int long long
#define endl '\n'
using namespace std;
const int N = 2e5 + 10;
const int SZ = 2;
struct Mat {
double a[SZ][SZ];
Mat(){ for(int i=0;i<SZ;i++) for(int j=0;j<SZ;j++) a[i][j]=0; }
void init(){ for(int i=0;i<SZ;i++) for(int j=0;j<SZ;j++) a[i][j]=(i==j)?1:0; }
Mat operator+(const Mat& dx){ Mat ls; for(int i=0;i<SZ;i++) for(int j=0;j<SZ;j++) ls.a[i][j]=a[i][j]+dx.a[i][j]; return ls; }
Mat operator*(const Mat& dx){ Mat ls; for(int i=0;i<SZ;i++) for(int j=0;j<SZ;j++) for(int k=0;k<SZ;k++) ls.a[i][j]+=a[i][k]*dx.a[k][j]; return ls; }
Mat mpow(long long e) const { Mat ls; ls.init(); Mat di=*this; while(e){ if(e&1) ls=ls*di; e>>=1; di=di*di; } return ls; }
} tree[N<<2], lazy[N<<2];
int a[N];
void pushdown(int dq){ lazy[dq<<1]=lazy[dq<<1]*lazy[dq]; lazy[dq<<1|1]=lazy[dq<<1|1]*lazy[dq]; tree[dq<<1]=tree[dq<<1]*lazy[dq]; tree[dq<<1|1]=tree[dq<<1|1]*lazy[dq]; lazy[dq].init(); }
void pushup(int dq){ tree[dq]=tree[dq<<1]+tree[dq<<1|1]; }
void build(int l,int r,int dq){
lazy[dq].init();
if(l==r){ tree[dq].a[0][0]=cos(a[l]); tree[dq].a[0][1]=-sin(a[l]); tree[dq].a[1][0]=sin(a[l]); tree[dq].a[1][1]=cos(a[l]); return; }
int ed=l+r>>1; build(l,ed,dq<<1); build(ed+1,r,dq<<1|1); pushup(dq);
}
void add(int l,int r,int ml,int mr,int dq,Mat zhi){
if(ml<=l&&r<=mr){ lazy[dq]=lazy[dq]*zhi; tree[dq]=tree[dq]*zhi; return; }
pushdown(dq); int ed=l+r>>1;
if(ml<=ed) add(l,ed,ml,mr,dq<<1,zhi);
if(mr>ed) add(ed+1,r,ml,mr,dq<<1|1,zhi);
pushup(dq);
}
Mat query(int l,int r,int ml,int mr,int dq){
if(ml<=l&&r<=mr) return tree[dq];
pushdown(dq); int ed=l+r>>1; Mat ls;
if(ml<=ed) ls=ls+query(l,ed,ml,mr,dq<<1);
if(mr>ed) ls=ls+query(ed+1,r,ml,mr,dq<<1|1);
return ls;
}
void solve(){
int n; cin>>n;
for(int i=1;i<=n;i++) cin>>a[i];
build(1,n,1);
int m; cin>>m; int sz,l,r,v;
while(m--){
cin>>sz;
if(sz==1){ cin>>l>>r>>v; Mat ls; ls.a[0][0]=cos(v); ls.a[0][1]=-sin(v); ls.a[1][0]=sin(v); ls.a[1][1]=cos(v); add(1,n,l,r,1,ls); }
else{ cin>>l>>r; auto ans=query(1,n,l,r,1); cout<<ans.a[1][0]<<endl; }
}
}
signed main(){ ios::sync_with_stdio(false); cin.tie(0); cout.tie(0); cout<<fixed<<setprecision(6); solve(); return 0; }
Matrix + data structure pattern:
- Expand the maintained values from scalars to matrices.
- push_down: $\sum\vec{F}(a_i+x)=M^x\sum\vec{F}(a_i)$ — the same constant matrix applies to the whole sum.
- push_up: simple matrix (or vector) addition.
When a lazy tag’s push_down is hard to implement, consider whether extending to matrices unlocks it.
4 Kitamasa — Faster Linear Recurrence
Compute $a_n$ for $a_n=c_1a_{n-1}+\cdots+c_ka_{n-k}$ in $O(k^2\log n)$ time and $O(k)$ space — better than matrix exponentiation’s $O(k^3\log n)$ for large $k$.
Algorithm
Represent the answer as a linear combination of initial values: $a_n=\sum_{i=0}^{k-1}r_i a_i$.
The coefficient vector $r(x)\bmod P(x)$ (where $P(x)=x^k-c_1x^{k-1}-\cdots-c_k$ is the characteristic polynomial) is maintained via polynomial fast power modulo $P(x)$.
Implementation — $O(k^2\log n)$
#include <bits/stdc++.h>
using namespace std;
using i64 = long long;
using i128 = __int128_t;
// a_n = c1*a_{n-1} + ... + ck*a_{n-k}
// init = [a0, a1, ..., a_{k-1}], coef = [c1, c2, ..., ck]
i64 kitamasa(vector<i64> init, vector<i64> coef, long long n, i64 mod){
int k = (int)coef.size();
if(n < (int)init.size()){ i64 x=init[(int)n]%mod; if(x<0) x+=mod; return x; }
for(auto &x:init){ x%=mod; if(x<0) x+=mod; }
for(auto &x:coef){ x%=mod; if(x<0) x+=mod; }
if(k==0) return 0;
if(k==1){
i64 base=coef[0], pw=1%mod; long long e=n;
while(e){ if(e&1) pw=(i64)((i128)pw*base%mod); e>>=1; if(e) base=(i64)((i128)base*base%mod); }
return (i64)((i128)init[0]*pw%mod);
}
auto mul = [&](vector<i64> A, vector<i64> B){
vector<i64> C(k);
for(int t=0;t<k;++t){
i64 v=A[t];
if(v){ for(int j=0;j<k;++j) C[j]=(C[j]+(i64)((i128)v*B[j]%mod))%mod; }
i64 bk=B[k-1];
for(int i=k-1;i>0;--i) B[i]=(B[i-1]+(i64)((i128)bk*coef[i]%mod))%mod;
B[0]=(i64)((i128)bk*coef[0]%mod);
}
return C;
};
vector<i64> resC(k,0), cur(k,0);
resC[0]=1%mod; cur[1]=1%mod; // resC=1 (constant), cur=x
long long e=n;
while(e){ if(e&1) resC=mul(cur,resC); e>>=1; if(e) cur=mul(cur,cur); }
i128 acc=0;
for(int i=0;i<k;++i) acc=(acc+(i128)resC[i]*init[i])%mod;
i64 ans=(i64)acc%mod; if(ans<0) ans+=mod;
return ans;
}
/*** usage
int main(){
// Fibonacci: a_n = a_{n-1} + a_{n-2}, a_0=0, a_1=1
vector<i64> init={0,1}, coef={1,1};
i64 mod=1000000007LL;
cout << kitamasa(init, coef, 50, mod) << "\n"; // F_50
}
***/
NTT-based version — $O(k\log^2 k\cdot\log n)$
Uses polynomial modular reduction ($x^n\bmod P(x)$) via NTT:
// prerequisite: qp, ntt, multiply, polyInv are implemented
void trim(vector<int>& a){ while(!a.empty()&&a.back()==0) a.pop_back(); }
vector<int> polyInv(const vector<int>& f, int m){
vector<int> g(1, qp(f[0])); // g0 = 1/f[0]
int len=1;
while(len<m){
int need=min(len<<1,m);
vector<int> fcut(min((int)f.size(),need));
for(int i=0;i<(int)fcut.size();++i) fcut[i]=f[i];
auto fg=multiply(fcut,g); fg.resize(need);
vector<int> t(need);
t[0]=(2-(fg.size()?fg[0]:0))%M; if(t[0]<0) t[0]+=M;
for(int i=1;i<need;++i){ int v=(i<(int)fg.size()?fg[i]:0); t[i]=v?(M-v):0; }
auto gnew=multiply(g,t); gnew.resize(need); g.swap(gnew); len=need;
}
g.resize(m); return g;
}
vector<int> polyMod(vector<int> A, const vector<int>& P, const vector<int>& inv_revP){
int k=(int)P.size()-1; trim(A);
if((int)A.size()<=k){ A.resize(k); return A; }
int n=(int)A.size()-1, m=n-k+1;
vector<int> rA(m); for(int i=0;i<m;++i) rA[i]=A[n-i];
vector<int> inv(m); for(int i=0;i<m;++i) inv[i]=inv_revP[i];
auto qrev_full=multiply(rA,inv); qrev_full.resize(m);
vector<int> Q(m); for(int i=0;i<m;++i) Q[i]=qrev_full[m-1-i];
auto QP=multiply(Q,P);
vector<int> R(max((int)A.size(),(int)QP.size()));
for(int i=0;i<(int)A.size();++i) R[i]=(R[i]+A[i])%M;
for(int i=0;i<(int)QP.size();++i){ R[i]=(R[i]-QP[i])%M; if(R[i]<0) R[i]+=M; }
R.resize(k); return R;
}
vector<int> pow_x_modP_ntt(long long n, const vector<int>& P, const vector<int>& inv_revP){
int k=(int)P.size()-1;
vector<int> res(1,1), base={0,1};
base=polyMod(base,P,inv_revP);
while(n){
if(n&1){ res=multiply(res,base); res=polyMod(res,P,inv_revP); }
n>>=1;
if(n){ base=multiply(base,base); base=polyMod(base,P,inv_revP); }
}
res.resize(k); return res;
}
// main interface: a_n = c1 a_{n-1} + ... + ck a_{n-k}
// init=[a0..a_{k-1}], coef=[c1..ck]
int kitamasa_ntt(const vector<int>& init, const vector<int>& coef, long long n){
int k=(int)coef.size();
if(n<(long long)init.size()) return (init[(int)n]%M+M)%M;
vector<int> P(k+1,0); P[k]=1;
for(int i=0;i<k;++i){ int ci=coef[i]%M; if(ci<0) ci+=M; P[k-1-i]=(M-ci)%M; }
vector<int> revP(k+1); for(int i=0;i<=k;++i) revP[i]=P[k-i];
auto inv_revP=polyInv(revP,k);
auto R=pow_x_modP_ntt(n,P,inv_revP);
long long ans=0;
for(int i=0;i<k;++i){ long long a=(i<(int)init.size()?init[i]:0); if(a<0) a+=M; ans=(ans+a*1ll*R[i])%M; }
return (int)ans;
}
/*** usage
int main(){
vector<int> init={0,1}, coef={1,1};
ll n; cin>>n;
cout << kitamasa_ntt(init, coef, n) << "\n";
}
***/