"""Intel MKL Pardiso via ``pypardiso``.
Pardiso is multi-threaded and wins big on problems ≳30k DOF. Requires
Intel MKL, which pypardiso pulls in as a dependency.
The ``import pypardiso`` call eagerly loads MKL (~200 ms on first touch),
so this module defers the import until a PardisoSolver is actually
constructed. That keeps ``import femorph_solver`` snappy for users who don't
touch Pardiso, and avoids MKL init penalising the CHOLMOD/SuperLU paths
on small models where the auto-selector never reaches Pardiso.
When the caller asserts ``assume_spd=True`` we switch to MKL's real-SPD
mode (``mtype=2``) which runs a sparse Cholesky factor. On a
57 k-DOF ptr.cdb ``K_ff``: factor 0.39 s / solve 43 ms vs the default
unsymmetric-LU path's 0.51 s / 94 ms — roughly **2× faster** solves
with identical output (max abs diff 5 × 10⁻¹⁶).
References
----------
- Schenk, O. & Gärtner, K. "Solving unsymmetric sparse systems of
linear equations with PARDISO." Future Generation Computer Systems
20 (3), 475-487 (2004). [the PARDISO algorithm Intel MKL ships]
- Schenk, O. & Gärtner, K. "On fast factorization pivoting methods
for sparse symmetric indefinite systems." ETNA 23, 158-179 (2006).
[supernodal + Bunch-Kaufman pivoting paths used for indefinite
systems (``mtype=-2``)]
- Intel. *Intel oneAPI Math Kernel Library Developer Reference*,
"Intel oneMKL PARDISO — Parallel Direct Sparse Solver Interface."
[``iparm`` flag semantics the ``DirectMklPardisoSolver`` sibling
wrapper drives directly via ctypes]
"""
from __future__ import annotations
import contextlib
import importlib.util
from typing import ClassVar
import numpy as np
import scipy.sparse as sp
from ._base import LinearSolverBase
#: Cheap, side-effect-free availability probe — does not load MKL.
#: (``find_spec`` only inspects the import machinery; the actual
#: ``import pypardiso`` is deferred to ``PardisoSolver._factor``.)
_HAVE: bool = importlib.util.find_spec("pypardiso") is not None

#: Auto cutoff for the smart ``mixed_precision=None`` default. Kept
#: for compatibility with callers that introspect it — **not used by
#: the current auto policy**, which defers to double precision
#: unconditionally after measuring that pypardiso's ``iparm(28)=1``
#: actually **increases** peak RSS by ~900 MB on a 160×160×2 plate
#: (confirmed against the linked MKL build; the single-precision
#: factor path isn't engaged — MKL seems to allocate the mixed-
#: precision refinement buffers on top of the double-precision
#: factor). Re-enable via explicit ``mixed_precision=True`` if your
#: local MKL/pypardiso build behaves differently.
_MP_AUTO_MIN_DOF = 50_000
class PardisoSolver(LinearSolverBase):
    """Intel MKL Pardiso via pypardiso (multi-threaded).

    Lifecycle: ``_factor(A)`` builds and monkey-patches a
    ``pypardiso.PyPardisoSolver`` around the (possibly triu-extracted)
    CSR matrix; ``solve(b)`` then goes straight to the patched
    low-level ``_call_pardiso``; ``release()`` (also wired to
    ``__del__``) frees MKL's factor memory, which is not reclaimed by
    Python garbage collection on its own.
    """

    name: ClassVar[str] = "pardiso"
    kind: ClassVar[str] = "direct"
    spd_only: ClassVar[bool] = False
    install_hint: ClassVar[str] = "`pip install pypardiso` (pulls in MKL)"

    @staticmethod
    def available() -> bool:
        """Report whether pypardiso is importable (probe only — MKL stays unloaded)."""
        return _HAVE

    def _resolve_mixed_precision(self) -> bool:
        """Decide whether MKL Pardiso should use mixed-precision factorisation.

        Honours the three-valued ``mixed_precision`` kwarg:

        * explicit ``True`` → request MP unconditionally (caller accepts
          the caveats below);
        * explicit ``False`` (**default**) → stay double-precision;
        * ``None`` → auto. Currently resolves to ``False`` — measured
          against the linked MKL build, pypardiso's ``iparm(28)=1``
          does not engage the single-precision factor path and instead
          allocates the refinement buffers on top of the double
          factor (peak RSS grew by ~900 MB on a 160×160×2 plate).
          The auto policy will opt in once we validate an MKL build
          where MP actually reduces peak.
        """
        # Both explicit False and auto (None) stay double precision
        # until a validated MP path lands; only explicit True requests MP.
        return self._mixed_precision is True

    def _factor(self, A) -> None:
        """Factorise ``A`` with MKL Pardiso and prime the solver for repeated solves.

        Stores the CSR matrix on ``self._A`` and a patched
        ``PyPardisoSolver`` on ``self._solver``. After this returns, the
        solver is pre-flipped to phase 33 (solve-only) and ``solve()``
        can bypass all of pypardiso's per-call validation.
        """
        import pypardiso  # deferred: eagerly loads MKL (~200 ms) — see module docstring

        # SPD fast path: MKL Pardiso ``mtype=2`` expects the upper
        # triangular part of A (CSR) and runs a sparse Cholesky factor
        # — ~2× faster than the default unsymmetric LU (``mtype=11``)
        # on every solve, and halves the factor memory. pypardiso's
        # wrapper only tests mtype=11 out of the box (and segfaults if
        # you pass the full symmetric matrix to mtype=2), so we extract
        # ``sp.triu(A)`` ourselves before handing it over.
        #
        # Callers that already built A as upper-triangular can set
        # ``A.femorph_triu = True`` on the csr_array; we then skip the
        # ``sp.triu`` copy (saves ~n_free × 12 B — 130 MB on a
        # 200×200×2 plate K).
        #
        # ``size_limit_storage=0`` forces pypardiso down its hash-only
        # cache path. The default (``5e7`` nnz) stashes a full ``A.copy()``
        # on the solver for every problem smaller than 50 M nnz — a 200×200×2
        # HEX8 triu K_ff is ~8 M nnz / ~85 MB, which means the default
        # holds a second copy of K_ff for the lifetime of the solver.
        # We monkey-patch the ``_is_already_factorized`` check below so the
        # stored hash is never read anyway — setting the limit to zero
        # gets us the same short-circuit with zero duplicated storage.
        # Saves ~85 MB at 200×200×2 on the shift-invert modal path.
        if self._assume_spd:
            if getattr(A, "femorph_triu", False):
                # Contract: already triu, CSR, float64 — the assembler guarantees it.
                self._A = A
            else:
                self._A = sp.triu(A, format="csr").astype(np.float64)
            self._solver = pypardiso.PyPardisoSolver(mtype=2, size_limit_storage=0)
        else:
            self._A = A.tocsr().astype(np.float64)
            self._solver = pypardiso.PyPardisoSolver(size_limit_storage=0)

        # iparm[25] = 1 enables MKL's level-set parallel forward/back
        # substitution; the default is sequential. On 8 threads this
        # saves ~3 % per solve on a 600 k-DOF K and has no downside on
        # smaller problems.
        self._solver.set_iparm(25, 1)

        # Mixed-precision factor (iparm(28)=1). Single-precision
        # factorisation halves the factor footprint; iterative
        # refinement with double-precision residuals brings the solve
        # back to ~double accuracy. Only useful for well-conditioned
        # SPD systems (SPD Cholesky rarely needs pivoting so the
        # refinement converges in 1–3 iterations).
        #
        # Caveats measured on this MKL build:
        # * pypardiso's author explicitly warns that iparm(28) and
        #   iparm(60) are unreliable; mileage varies by MKL version.
        # * Some builds silently no-op the request (same iparm(17)
        #   factor-size report) — callers that need a guaranteed
        #   single-precision factor should probe ``iparm(28)`` after
        #   factorize() and fall back if it came back 0.
        mp_requested = self._resolve_mixed_precision()
        if mp_requested:
            self._solver.set_iparm(28, 1)

        self._solver.factorize(self._A)

        # pypardiso.solve(A, b) SHA-1-hashes the full CSR on every call to
        # detect whether the factorisation is still valid. Our wrapper
        # owns the solver instance and the matrix never changes between
        # factor and subsequent solves, so the check is pure overhead —
        # ~10 % of wall time inside ARPACK shift-invert on ptr.cdb-size
        # problems. Monkey-patch it to always short-circuit to "yes".
        self._solver._is_already_factorized = lambda _A: True

        # Cache the 1-based CSR index arrays MKL Pardiso expects. The
        # stock ``PyPardisoSolver._call_pardiso`` rebuilds them on every
        # solve (``A.indptr.astype(int32) + 1`` and the same for
        # indices) — that's ~4 × nnz bytes of alloc/free *per solve*.
        # An ARPACK shift-invert typically fires 30-40 solves per
        # ``modal_solve``, so at 150×150×2 HEX8 triu K (~16 M nnz)
        # this wastes ~2 GB of transient allocation over the
        # converged run. pypardiso's own path accepts the churn; we
        # own the factor + A for the solver's lifetime and can cache
        # once. Installed as a per-instance ``_call_pardiso`` override
        # so we reuse every MKL-facing check and argument except the
        # two cached buffers.
        self._install_cached_call_pardiso()

        # Pre-flip to solve-only phase. ``pypardiso.solve`` otherwise
        # calls ``set_phase(33)`` on every solve; we do it once here
        # because the matrix is frozen from now on. This is also the
        # precondition that lets ``solve()`` below skip
        # ``pypardiso.solve``'s per-call ``_check_A`` /
        # ``_check_b`` — both run an O(n) scan per invocation
        # (``np.diff(A.indptr).all()`` plus a dense-format coercion
        # on ``b``). Our factor invariant guarantees they'd succeed,
        # so short-circuit straight to the cached ``_call_pardiso``.
        self._solver.set_phase(33)

    def _install_cached_call_pardiso(self) -> None:
        """Replace the solver's ``_call_pardiso`` with a closure reusing
        pre-built 1-based (Fortran-indexed) CSR index buffers.

        The closure mirrors ``PyPardisoSolver._call_pardiso`` minus the
        per-solve rebuild of the +1-shifted ``indptr``/``indices`` arrays.
        """
        import ctypes  # noqa: PLC0415 (local to isolate from top-level numpy/scipy imports)
        import types  # noqa: PLC0415

        ia_p1 = np.ascontiguousarray(self._A.indptr, dtype=np.int32) + np.int32(1)
        ja_p1 = np.ascontiguousarray(self._A.indices, dtype=np.int32) + np.int32(1)
        self._ia_p1 = ia_p1  # keep a reference so the buffers outlive the closure
        self._ja_p1 = ja_p1
        solver = self._solver
        mkl_pardiso = solver._mkl_pardiso
        pt_type_ctype = solver._pt_type[0]
        c_int32_p = ctypes.POINTER(ctypes.c_int32)
        c_float64_p = ctypes.POINTER(ctypes.c_double)

        def _call_pardiso_cached(self_solver, A, b):  # noqa: ANN001 — pypardiso signature
            x = np.zeros_like(b)
            pardiso_error = ctypes.c_int32(0)
            # Argument order follows the MKL ``pardiso`` C prototype:
            # pt, maxfct, mnum, mtype, phase, n, a, ia, ja, perm, nrhs,
            # iparm, msglvl, b, x, error.
            mkl_pardiso(
                self_solver.pt.ctypes.data_as(ctypes.POINTER(pt_type_ctype)),
                ctypes.byref(ctypes.c_int32(1)),  # maxfct
                ctypes.byref(ctypes.c_int32(1)),  # mnum
                ctypes.byref(ctypes.c_int32(self_solver.mtype)),
                ctypes.byref(ctypes.c_int32(self_solver.phase)),
                ctypes.byref(ctypes.c_int32(A.shape[0])),  # n
                A.data.ctypes.data_as(c_float64_p),
                ia_p1.ctypes.data_as(c_int32_p),  # cached 1-based indptr
                ja_p1.ctypes.data_as(c_int32_p),  # cached 1-based indices
                self_solver.perm.ctypes.data_as(c_int32_p),
                ctypes.byref(ctypes.c_int32(1 if b.ndim == 1 else b.shape[1])),  # nrhs
                self_solver.iparm.ctypes.data_as(c_int32_p),
                ctypes.byref(ctypes.c_int32(self_solver.msglvl)),
                b.ctypes.data_as(c_float64_p),
                x.ctypes.data_as(c_float64_p),
                ctypes.byref(pardiso_error),
            )
            if pardiso_error.value != 0:
                from pypardiso.pardiso_wrapper import PyPardisoError  # noqa: PLC0415

                raise PyPardisoError(pardiso_error.value)
            return np.ascontiguousarray(x)

        solver._call_pardiso = types.MethodType(_call_pardiso_cached, solver)

    def solve(self, b: np.ndarray) -> np.ndarray:
        """Solve ``A x = b`` against the factor prepared by ``_factor``."""
        b = np.ascontiguousarray(b, dtype=np.float64)
        # Skip ``pypardiso.solve``'s preamble — every one of its checks
        # (``_check_A`` scans ``np.diff(A.indptr).all()`` at O(n)/solve,
        # ``_check_b`` coerces dense-fortran layout, ``set_phase(33)``
        # rewrites a scalar) holds true by our factor invariant. At
        # 30-40 ARPACK iterations per ``modal_solve`` on a 200k-DOF
        # triu K the preamble adds up to ~5% of eigsh wall time.
        #
        # NOTE(review): for 2-D ``b`` (multiple RHS) this hands MKL a
        # C-contiguous array, while pypardiso's skipped ``_check_b``
        # coerces Fortran order — confirm multi-RHS callers get
        # correctly laid-out results, or that only 1-D ``b`` reaches
        # this path.
        return self._solver._call_pardiso(self._A, b)

    def release(self) -> None:
        """Release MKL's internal factor memory.

        MKL Pardiso holds the numerical factor on a 64-slot ``pt``
        handle array that survives Python-side garbage collection —
        dropping our :class:`PardisoSolver` reference does not free
        it. ``pypardiso.free_memory(everything=True)`` runs phase
        ``-1``, which tears down the MKL side. On a 200×200×2
        HEX8 plate that's ~390 MB of permanent factor that would
        otherwise live until process exit; for workflows that run
        multiple solves (modal + static + modal, notebooks iterating
        over meshes) the leak accumulates linearly.

        Idempotent — safe to call multiple times. Safe to call
        mid-interpreter-shutdown because ``pypardiso.free_memory``
        constructs a tiny dummy ``A`` / ``b`` locally and the MKL
        library handle is already resolved.
        """
        solver = getattr(self, "_solver", None)
        if solver is None:
            return
        # Best-effort: if MKL or pypardiso is already torn down
        # (e.g. interpreter shutdown), swallow — the OS will
        # reclaim everything momentarily anyway.
        with contextlib.suppress(Exception):
            solver.free_memory(everything=True)
        self._solver = None
        self._A = None

    def __del__(self) -> None:
        # GC hook so callers that simply drop the reference (the common
        # case — ARPACK wraps us in a ``LinearOperator`` that goes out
        # of scope when the eigensolve returns) still see MKL release.
        # ``release`` is idempotent + shutdown-safe.
        self.release()