# Source code for femorph_solver.solvers.linear._pardiso

"""Intel MKL Pardiso via ``pypardiso``.

Pardiso is multi-threaded and wins big on problems ≳30k DOF.  Requires
Intel MKL, which pypardiso pulls in as a dependency.

The ``import pypardiso`` call eagerly loads MKL (~200 ms on first touch),
so this module defers the import until a PardisoSolver is actually
constructed.  That keeps ``import femorph_solver`` snappy for users who don't
touch Pardiso, and avoids MKL init penalising the CHOLMOD/SuperLU paths
on small models where the auto-selector never reaches Pardiso.

When the caller asserts ``assume_spd=True`` we switch to MKL's real-SPD
mode (``mtype=2``) which runs a sparse Cholesky factor.  On a
57 k-DOF ptr.cdb ``K_ff``: factor 0.39 s / solve 43 ms vs the default
unsymmetric-LU path's 0.51 s / 94 ms — roughly **2× faster** solves
with identical output (max abs diff 5 × 10⁻¹⁶).

References
----------
- Schenk, O. & Gärtner, K.  "Solving unsymmetric sparse systems of
  linear equations with PARDISO."  Future Generation Computer Systems
  20 (3), 475-487 (2004).  [the PARDISO algorithm Intel MKL ships]
- Schenk, O. & Gärtner, K.  "On fast factorization pivoting methods
  for sparse symmetric indefinite systems."  ETNA 23, 158-179 (2006).
  [supernodal + Bunch-Kaufman pivoting paths used for indefinite
  systems (``mtype=-2``)]
- Intel.  *Intel oneAPI Math Kernel Library Developer Reference*,
  "Intel oneMKL PARDISO — Parallel Direct Sparse Solver Interface."
  [``iparm`` flag semantics the ``DirectMklPardisoSolver`` sibling
  wrapper drives directly via ctypes]
"""

from __future__ import annotations

import contextlib
import importlib.util
from typing import ClassVar

import numpy as np
import scipy.sparse as sp

from ._base import LinearSolverBase

#: Cheap, side-effect-free availability probe — does not load MKL.
#: ``find_spec`` only consults the import machinery's finders, so probing
#: for ``pypardiso`` here avoids the eager MKL initialisation that a real
#: ``import pypardiso`` would trigger (see module docstring).
_HAVE: bool = importlib.util.find_spec("pypardiso") is not None

#: Auto cutoff for the smart ``mixed_precision=None`` default.  Kept
#: only for compatibility with callers that introspect it — **not used
#: by the current auto policy**, which defers to double precision
#: unconditionally after measuring that pypardiso's ``iparm(28)=1``
#: actually **increases** peak RSS by ~900 MB on a 160×160×2 plate
#: (confirmed against the linked MKL build; the single-precision
#: factor path isn't engaged — MKL seems to allocate the mixed-
#: precision refinement buffers on top of the double-precision
#: factor).  Re-enable via explicit ``mixed_precision=True`` if your
#: local MKL/pypardiso build behaves differently.
_MP_AUTO_MIN_DOF = 50_000


class PardisoSolver(LinearSolverBase):
    """Intel MKL Pardiso via pypardiso (multi-threaded direct solver)."""

    name: ClassVar[str] = "pardiso"
    kind: ClassVar[str] = "direct"
    spd_only: ClassVar[bool] = False
    install_hint: ClassVar[str] = "`pip install pypardiso` (pulls in MKL)"

    @staticmethod
    def available() -> bool:
        # Pure probe — ``_HAVE`` was computed via ``find_spec`` without
        # importing (and thus without initialising) MKL.
        return _HAVE

    def _resolve_mixed_precision(self) -> bool:
        """Decide whether MKL Pardiso should use mixed-precision factorisation.

        Honours the three-valued ``mixed_precision`` kwarg:

        * explicit ``True``  → request MP unconditionally (caller accepts
          the caveats noted in ``_factor``);
        * explicit ``False`` (**default**) → stay double-precision;
        * ``None`` → auto.  Currently resolves to ``False`` — measured
          against the linked MKL build, pypardiso's ``iparm(28)=1`` does
          not engage the single-precision factor path and instead
          allocates the refinement buffers on top of the double factor
          (peak RSS grew by ~900 MB on a 160×160×2 plate).  The auto
          policy will opt in once we validate an MKL build where MP
          actually reduces peak.
        """
        # Only an explicit True requests MP; both explicit False and the
        # auto None stay double precision until a validated MP path lands.
        return self._mixed_precision is True

    def _factor(self, A):
        import pypardiso

        # --- complex CSR ------------------------------------------------
        # Two routes:
        #   * ``assume_hermitian=True`` → MKL ``mtype=-4`` (complex
        #     Hermitian indefinite, Bunch-Kaufman pivoting).  Expects the
        #     upper-triangular CSR — half the data + analysis cost vs the
        #     unsymmetric branch on a Hermitian matrix.  ``-4`` rather
        #     than ``+4`` (Hermitian *positive-definite*) because the
        #     cyclic-reduced K_red is generally indefinite at
        #     intermediate harmonics; ``+4`` would Cholesky-fail with
        #     error -4 the first time it hit a negative pivot.
        #   * otherwise → MKL ``mtype=13`` (complex unsymmetric).
        # pypardiso's high-level ``factorize``/``solve`` reject
        # non-float64 in ``_check_A``/``_check_b``, so we drive MKL
        # directly through the cached ``_call_pardiso`` for both the
        # initial phase-12 and subsequent phase-33 calls.
        is_complex = (
            np.iscomplexobj(A.data) if hasattr(A, "data") else np.iscomplexobj(A)
        )
        if is_complex:
            self._complex = True
            if self._assume_hermitian:
                if getattr(A, "femorph_triu", False):
                    # Caller already produced the upper-triangular CSR.
                    self._A = A.astype(np.complex128)
                else:
                    self._A = sp.triu(A, format="csr").astype(np.complex128)
                self._mtype = -4
            else:
                self._A = A.tocsr().astype(np.complex128)
                self._mtype = 13
            self._solver = pypardiso.PyPardisoSolver(
                mtype=self._mtype, size_limit_storage=0
            )
            # Leave iparm at the all-zeros initial state — MKL Pardiso
            # treats ``iparm[0] == 0`` as "use internal defaults", which
            # is what we want for both ``mtype=13`` (proven path from
            # #700) and ``mtype=-4`` (Hermitian indefinite — MKL's
            # internal defaults wire in METIS reorder + symmetric
            # weighted matching automatically).  Calling ``pardisoinit``
            # to populate iparm is redundant *and* observed to crash
            # ARPACK Lanczos on some MKL builds when later phases see a
            # changed iparm; the all-zeros sentinel is the most portable
            # configuration.  The one explicit override is iparm(25)
            # (MKL 1-based numbering) = parallel forward/back
            # substitution, which is safe across mtypes.
            self._solver.set_iparm(25, 1)  # parallel solve phase
            # Skip pypardiso.factorize (its _check_A rejects complex);
            # set phase 12 and call MKL directly with a dummy complex b.
            self._install_cached_call_pardiso()
            self._solver._is_already_factorized = lambda _A: True
            self._solver.set_phase(12)
            self._solver._call_pardiso(
                self._A, np.zeros((self._A.shape[0], 1), dtype=np.complex128)
            )
            self._solver.set_phase(33)
            return

        self._complex = False

        # --- real CSR ---------------------------------------------------
        # SPD fast path: MKL ``mtype=2`` expects the upper-triangular
        # part of A (CSR) and runs a sparse Cholesky — ~2× faster than
        # the default unsymmetric LU (``mtype=11``) on every solve, and
        # halves the factor memory.  pypardiso's wrapper only tests
        # mtype=11 out of the box (and segfaults if you pass the full
        # symmetric matrix to mtype=2), so we extract ``sp.triu(A)``
        # ourselves.  Callers that already built A upper-triangular can
        # set ``A.femorph_triu = True`` on the csr_array; we then skip
        # the ``sp.triu`` copy (saves ~n_free × 12 B — 130 MB on a
        # 200×200×2 plate K).
        #
        # ``size_limit_storage=0`` forces pypardiso down its hash-only
        # cache path.  The default (``5e7`` nnz) stashes a full
        # ``A.copy()`` on the solver for every problem smaller than
        # 50 M nnz — i.e. a second copy of K_ff for the lifetime of the
        # solver.  The stored hash is never read anyway (we monkey-patch
        # ``_is_already_factorized`` below), so zero gets the same
        # short-circuit with zero duplicated storage (~85 MB saved at
        # 200×200×2 on the shift-invert modal path).
        if self._assume_spd:
            if getattr(A, "femorph_triu", False):
                # Contract: already triu, CSR, float64 — the assembler
                # guarantees it.
                self._A = A
            else:
                self._A = sp.triu(A, format="csr").astype(np.float64)
            self._solver = pypardiso.PyPardisoSolver(mtype=2, size_limit_storage=0)
        else:
            self._A = A.tocsr().astype(np.float64)
            self._solver = pypardiso.PyPardisoSolver(size_limit_storage=0)

        # iparm(25) = 1 (MKL 1-based numbering) enables MKL's parallel
        # forward/back substitution; the default is sequential.  On 8
        # threads this saves ~3 % per solve on a 600 k-DOF K and has no
        # downside on smaller problems.
        self._solver.set_iparm(25, 1)

        # Mixed-precision factor (iparm(28)=1).  Single-precision
        # factorisation halves the factor footprint; iterative refinement
        # with double-precision residuals brings the solve back to
        # ~double accuracy.  Only useful for well-conditioned SPD systems
        # (SPD Cholesky rarely needs pivoting, so refinement converges in
        # 1-3 iterations).  Caveats measured on this MKL build:
        #   * pypardiso's author explicitly warns that iparm(28) and
        #     iparm(60) are unreliable; mileage varies by MKL version.
        #   * Some builds silently no-op the request (same iparm(17)
        #     factor-size report) — callers that need a guaranteed
        #     single-precision factor should probe ``iparm(28)`` after
        #     factorize() and fall back if it came back 0.
        if self._resolve_mixed_precision():
            self._solver.set_iparm(28, 1)

        self._solver.factorize(self._A)

        # pypardiso.solve(A, b) SHA-1-hashes the full CSR on every call
        # to detect whether the factorisation is still valid.  We own the
        # solver instance and the matrix never changes between factor and
        # subsequent solves, so the check is pure overhead — ~10 % of
        # wall time inside ARPACK shift-invert on ptr.cdb-size problems.
        # Monkey-patch it to always short-circuit to "yes".
        self._solver._is_already_factorized = lambda _A: True

        # Cache the 1-based CSR index arrays MKL Pardiso expects; the
        # stock ``_call_pardiso`` rebuilds them on every solve (~4 × nnz
        # bytes of alloc/free per solve, ~2 GB transient across a
        # converged ARPACK run at 150×150×2 HEX8 triu K).
        self._install_cached_call_pardiso()

        # Pre-flip to the solve-only phase once — the matrix is frozen
        # from here on.  This is also the invariant that lets ``solve``
        # skip ``pypardiso.solve``'s per-call ``_check_A``/``_check_b``
        # preamble (both O(n) per invocation).
        self._solver.set_phase(33)

    def _install_cached_call_pardiso(self) -> None:
        """Install a per-instance ``_call_pardiso`` with cached 1-based indices.

        The stock ``PyPardisoSolver._call_pardiso`` rebuilds the 1-based
        ``indptr``/``indices`` copies MKL wants on every call.  We own the
        factor and the matrix for the solver's lifetime, so the buffers
        are built once here and closed over; everything else about the
        MKL call (argument order, error handling) mirrors pypardiso.
        """
        import ctypes  # noqa: PLC0415 (local to isolate from top-level numpy/scipy imports)
        import types  # noqa: PLC0415

        ia_p1 = np.ascontiguousarray(self._A.indptr, dtype=np.int32) + np.int32(1)
        ja_p1 = np.ascontiguousarray(self._A.indices, dtype=np.int32) + np.int32(1)
        # Keep references on the instance so the buffers outlive the closure.
        self._ia_p1 = ia_p1
        self._ja_p1 = ja_p1

        solver = self._solver
        mkl_pardiso = solver._mkl_pardiso
        pt_type_ctype = solver._pt_type[0]
        c_int32_p = ctypes.POINTER(ctypes.c_int32)
        c_float64_p = ctypes.POINTER(ctypes.c_double)

        # Pre-bind the constants Pardiso never sees change across solves
        # (maxfct, mnum, nrhs=1 for 1-D b, n).  Without this, every call
        # walks ``ctypes.byref(ctypes.c_int32(...))`` 3-5 times — small,
        # but it adds up over ~50 ARPACK iters/harmonic × 9 harmonics.
        # The per-call dynamic args are the dtype-dependent A.data, b, x
        # pointers and the iparm/perm/pt array data pointers (whose
        # backing memory may relocate between calls).
        maxfct_ref = ctypes.byref(ctypes.c_int32(1))
        mnum_ref = ctypes.byref(ctypes.c_int32(1))
        nrhs1_ref = ctypes.byref(ctypes.c_int32(1))
        n_ref = ctypes.byref(ctypes.c_int32(self._A.shape[0]))

        def _call_pardiso_cached(self_solver, A, b):  # noqa: ANN001 — pypardiso signature
            # ``np.empty_like(b)`` instead of pypardiso's stock
            # ``np.zeros_like(b)``: Pardiso phase 33 writes the full
            # output, so the memset is dead work — at 488 k complex DOFs
            # it costs an 8 MB zero-fill per ARPACK iteration.
            x = np.empty_like(b)
            pardiso_error = ctypes.c_int32(0)
            nrhs_ref = (
                nrhs1_ref if b.ndim == 1 else ctypes.byref(ctypes.c_int32(b.shape[1]))
            )
            mkl_pardiso(
                self_solver.pt.ctypes.data_as(ctypes.POINTER(pt_type_ctype)),
                maxfct_ref,
                mnum_ref,
                ctypes.byref(ctypes.c_int32(self_solver.mtype)),
                ctypes.byref(ctypes.c_int32(self_solver.phase)),
                n_ref,
                A.data.ctypes.data_as(c_float64_p),
                ia_p1.ctypes.data_as(c_int32_p),
                ja_p1.ctypes.data_as(c_int32_p),
                self_solver.perm.ctypes.data_as(c_int32_p),
                nrhs_ref,
                self_solver.iparm.ctypes.data_as(c_int32_p),
                ctypes.byref(ctypes.c_int32(self_solver.msglvl)),
                b.ctypes.data_as(c_float64_p),
                x.ctypes.data_as(c_float64_p),
                ctypes.byref(pardiso_error),
            )
            if pardiso_error.value != 0:
                from pypardiso.pardiso_wrapper import PyPardisoError  # noqa: PLC0415

                raise PyPardisoError(pardiso_error.value)
            return x

        solver._call_pardiso = types.MethodType(_call_pardiso_cached, solver)

    def solve(self, b: np.ndarray) -> np.ndarray:
        """Back-substitute against the cached factor (MKL phase 33)."""
        dtype = np.complex128 if getattr(self, "_complex", False) else np.float64
        b = np.ascontiguousarray(b, dtype=dtype)
        # Skip ``pypardiso.solve``'s preamble — every one of its checks
        # (``_check_A`` scans ``np.diff(A.indptr).all()`` at O(n)/solve,
        # ``_check_b`` coerces layout, ``set_phase(33)`` rewrites a
        # scalar) holds by our factor invariant.  At 30-40 ARPACK
        # iterations per ``modal_solve`` on a 200k-DOF triu K the
        # preamble adds up to ~5 % of eigsh wall time.
        return self._solver._call_pardiso(self._A, b)

    def refactor(self, A) -> None:
        """Numeric-only refactor reusing the existing analysis (MKL phase 22).

        Pattern must match the matrix passed at construction time.  Skips
        the analysis (phase 11 ≈ 30-50 % of an unsymmetric LU factor on
        rotor-scale K and ≈ 10-25 % of an SPD Cholesky), which is the win
        for sweep-style callers (cyclic harmonics, parametric modal
        sweeps where only the matrix values change).

        The cached ``ia_p1``/``ja_p1`` arrays from
        :meth:`_install_cached_call_pardiso` stay valid because they
        depend only on the (unchanged) pattern; we only refresh the
        matrix copy so the cached ``_call_pardiso`` sees the new numeric
        values.

        Raises
        ------
        ValueError
            If the new matrix differs in nnz or shape.  (Equal nnz and
            shape with a *different* pattern is not detected — that part
            of the contract rests on the caller.)
        """
        if getattr(self, "_complex", False):
            if self._assume_hermitian:
                if getattr(A, "femorph_triu", False):
                    A_new = A.astype(np.complex128)
                else:
                    A_new = sp.triu(A, format="csr").astype(np.complex128)
            else:
                A_new = A.tocsr().astype(np.complex128)
            dummy_dtype = np.complex128
        elif self._assume_spd:
            if getattr(A, "femorph_triu", False):
                A_new = A
            else:
                A_new = sp.triu(A, format="csr").astype(np.float64)
            dummy_dtype = np.float64
        else:
            A_new = A.tocsr().astype(np.float64)
            dummy_dtype = np.float64

        if A_new.nnz != self._A.nnz or A_new.shape != self._A.shape:
            raise ValueError(
                "PardisoSolver.refactor requires identical sparsity pattern "
                f"(original nnz={self._A.nnz}, shape={self._A.shape}; "
                f"got nnz={A_new.nnz}, shape={A_new.shape}). Construct a "
                "fresh solver instead."
            )

        # Bind the new numeric data; keep the cached 1-based indptr /
        # indices that the patched ``_call_pardiso`` already references.
        self._A = A_new

        # MKL phase 22 = numeric factor only (reuses the analysis from
        # the construction-time phase 11/12).  Bypass pypardiso's
        # ``factorize`` (which always forces phase 12) and drive the
        # cached ``_call_pardiso`` directly with a dummy RHS — the call
        # ignores ``b`` for non-solve phases.
        self._solver.set_phase(22)
        try:
            self._solver._call_pardiso(
                self._A, np.zeros((self._A.shape[0], 1), dtype=dummy_dtype)
            )
        finally:
            # Always flip back to solve mode so the cached ``solve`` hot
            # path stays valid even on a phase-22 raise.
            self._solver.set_phase(33)

    def release(self) -> None:
        """Release MKL's internal factor memory (phase ``-1``).

        MKL Pardiso holds the numerical factor on a 64-slot ``pt`` handle
        array that survives Python-side garbage collection — dropping our
        :class:`PardisoSolver` reference does not free it.
        ``free_memory(everything=True)`` runs phase ``-1``, which tears
        down the MKL side.  On a 200×200×2 HEX8 plate that's ~390 MB of
        permanent factor that would otherwise live until process exit;
        for workflows that run multiple solves (modal + static + modal,
        notebooks iterating over meshes) the leak accumulates linearly.

        Idempotent — safe to call multiple times.  Safe to call
        mid-interpreter-shutdown because ``pypardiso.free_memory``
        constructs a tiny dummy ``A``/``b`` locally and the MKL library
        handle is already resolved.
        """
        solver = getattr(self, "_solver", None)
        if solver is None:
            return
        # Best-effort: if MKL or pypardiso is already torn down (e.g.
        # interpreter shutdown), swallow — the OS will reclaim everything
        # momentarily anyway.
        with contextlib.suppress(Exception):
            solver.free_memory(everything=True)
        self._solver = None
        self._A = None

    def __del__(self) -> None:
        # GC hook so callers that simply drop the reference (the common
        # case — ARPACK wraps us in a ``LinearOperator`` that goes out of
        # scope when the eigensolve returns) still see MKL release.
        # ``release`` is idempotent + shutdown-safe.
        self.release()