# Source code for femorph_solver.solvers.linear._mkl_pardiso

"""Direct ctypes wrapper around MKL Pardiso.

Why not keep using ``pypardiso``?

  * **iparm(28)=1 silently no-ops on this MKL build.** pypardiso sets
    the mixed-precision flag but after ``factorize`` the solver reports
    ``iparm[27]=0`` — MKL chose to keep a double factor anyway.  We
    want a direct path so we can verify (and, where the build allows,
    actually engage) single-precision factorisation that halves
    factor memory.
  * **Per-solve ``A.indptr.astype(int32) + 1`` + ``A.indices.astype +
    1`` churn.** PR #112 patched this with a cached ``_call_pardiso``
    override; doing it in-house from the start avoids the monkey-patch
    entirely and lets us tell MKL we ship 0-based indices
    (``iparm[34]=1``) so no ``+1`` copy ever happens.
  * **Cleanup.** ``pypardiso`` does not call ``phase=-1`` on GC, so the
    MKL factor leaks across repeated solves (see agent-1's #107).  We
    own ``__del__`` here and release explicitly.
  * **Phase control.** pypardiso bundles analyse + factorise into
    ``factorize()``; for user code that wants to re-solve with the same
    pattern but updated values (modal sweeps over perturbed matrices),
    we can split ``phase=11`` (analyse) from ``phase=22`` (factor
    only) and amortise the analyse cost.

The MKL library load path piggybacks on pypardiso's discovery — we
defer to its already-loaded handle when available so we don't
duplicate the mkl_rt search heuristic.  Users who set the
``PYPARDISO_MKL_RT`` environment variable get the same resolution on
this backend.

References
----------
- Schenk, O. & Gärtner, K.  "Solving unsymmetric sparse systems of
  linear equations with PARDISO."  Future Generation Computer Systems
  20 (3), 475-487 (2004).  [the Pardiso algorithm implemented by MKL]
- Schenk, O. & Gärtner, K.  "On fast factorization pivoting methods
  for sparse symmetric indefinite systems."  ETNA 23, 158-179 (2006).
- Intel.  *Intel oneAPI Math Kernel Library Developer Reference*,
  "Intel oneMKL PARDISO — Parallel Direct Sparse Solver Interface."
  [authoritative ``iparm`` / ``phase`` / ``mtype`` spec — the set
  of flags this wrapper drives directly]
- Intel.  *Intel MKL PARDISO out-of-core user guide.*
  [``iparm(60)`` OOC mode + ``MKL_PARDISO_OOC_MAX_CORE_SIZE``
  heuristic exercised by the ``ooc=True`` path]
"""

from __future__ import annotations

import contextlib
import ctypes
import glob
import os
import shutil
import site
import sys
from ctypes import POINTER, byref, c_int32, c_int64
from ctypes.util import find_library
from pathlib import Path
from typing import ClassVar

import numpy as np
import scipy.sparse as sp

from ._base import LinearSolverBase


def _load_mkl_rt() -> ctypes.CDLL | None:
    """Resolve the ``mkl_rt`` shared library using the same heuristic
    pypardiso ships.  Returns ``None`` if no MKL is available.
    """
    # Explicit override wins — matches pypardiso's behaviour.
    custom = os.environ.get("PYPARDISO_MKL_RT")
    if custom:
        try:
            return ctypes.CDLL(custom)
        except OSError:
            return None

    for name in ("mkl_rt", "mkl_rt.1", "mkl_rt.2"):
        path = find_library(name)
        if path:
            try:
                return ctypes.CDLL(path)
            except OSError:
                pass

    # Fall back to pypardiso's glob-into-sys.prefix search.
    for pattern in (
        f"{sys.prefix}/[Ll]ib*/**/*mkl_rt*",
        f"{site.USER_BASE}/[Ll]ib*/**/*mkl_rt*",
    ):
        for path in sorted(glob.glob(pattern, recursive=True), key=len):
            try:
                return ctypes.CDLL(path)
            except OSError:
                pass

    # Last resort: if ``pypardiso`` is installed and has already loaded
    # MKL into our process, reuse its handle.  Creating a throwaway
    # solver instance triggers its internal load — cheap (no factor).
    try:
        import pypardiso  # type: ignore[import-untyped]  # noqa: PLC0415

        return pypardiso.PyPardisoSolver(mtype=11).libmkl
    except ImportError:
        return None


#: Process-wide MKL handle, resolved once at import time; ``None`` when
#: no MKL runtime could be located by :func:`_load_mkl_rt`.
_MKL: ctypes.CDLL | None = _load_mkl_rt()
#: Availability flag consumed by ``DirectMklPardisoSolver.available``.
_HAVE: bool = _MKL is not None

# MKL Pardiso's ``pt`` (internal solver handle) is an array of
# pointer-sized integers — 64 entries, 64-bit on any platform where
# ``sizeof(void*) == 8`` (basically every host running MKL today).
# ``_PT_TYPE`` is the ctypes element type for the prototype below;
# ``_PT_NP`` is the matching numpy dtype for the instance's pt buffer.
_PT_TYPE = c_int64 if ctypes.sizeof(ctypes.c_void_p) == 8 else c_int32
_PT_NP = np.int64 if ctypes.sizeof(ctypes.c_void_p) == 8 else np.int32


if _HAVE:
    # Declare the C prototype once at import so every later call goes
    # through a type-checked ctypes signature.  Only declared when MKL
    # actually loaded; ``available()`` gates callers away otherwise.
    _pardiso = _MKL.pardiso
    _pardiso.argtypes = [
        POINTER(_PT_TYPE),  # pt: opaque 64-slot internal solver handle
        POINTER(c_int32),  # maxfct: factors kept simultaneously (we use 1)
        POINTER(c_int32),  # mnum: which factor to use, 1-based (always 1)
        POINTER(c_int32),  # mtype: matrix type (2 = SPD, 11 = general real)
        POINTER(c_int32),  # phase: 11/22/33/12/-1 solver phase selector
        POINTER(c_int32),  # n: matrix order
        ctypes.c_void_p,  # a: CSR values (double)
        POINTER(c_int32),  # ia: CSR indptr (0-based via iparm[34]=1)
        POINTER(c_int32),  # ja: CSR indices (0-based via iparm[34]=1)
        POINTER(c_int32),  # perm: user permutation (unused here, zeros)
        POINTER(c_int32),  # nrhs: number of right-hand sides
        POINTER(c_int32),  # iparm: 64-slot control/statistics array
        POINTER(c_int32),  # msglvl: 1 = print factor statistics to stdout
        ctypes.c_void_p,  # b: right-hand side buffer (double)
        ctypes.c_void_p,  # x: solution buffer (double)
        POINTER(c_int32),  # error: MKL status out-param (0 = success)
    ]
    _pardiso.restype = None  # the C routine returns void


#: MKL Pardiso matrix types.  SPD = 2 (real symmetric positive
#: definite), symmetric indefinite = -2, real unsymmetric = 11.  We
#: only plug 2 and 11 below because everything else (complex, Hermitian)
#: doesn't fit the modal pipeline's real-double contract.
_MTYPE_SPD = 2
_MTYPE_GENERAL = 11

#: MKL Pardiso negative error codes, per oneMKL reference.  The
#: numeric value is all MKL hands back; this map translates it to a
#: human-readable hint plus (for the OOC-specific codes) the env var
#: or filesystem condition that fixes it.  Used in :meth:`_call` so
#: ``RuntimeError`` messages are actionable instead of "MKL error -9".
#: Unknown codes fall back to a generic "see oneMKL return codes" hint.
_MKL_ERROR_MESSAGES: dict[int, str] = {
    -1: "input inconsistent (check matrix shape, indptr/indices dtype, sorted-indices)",
    -2: "not enough memory for the in-core factorisation — try ooc=True or a larger host",
    -3: "reordering problem — check METIS availability / iparm[1]",
    -4: "zero pivot during numerical factorisation (matrix may be near-singular)",
    -5: "unclassified (internal) MKL error — consider filing upstream",
    -6: "preordering failed",
    -7: "diagonal matrix problem (zero on diagonal?)",
    -8: "32-bit integer overflow — matrix too large for int32 CSR",
    -9: "not enough memory for OOC — raise MKL_PARDISO_OOC_MAX_CORE_SIZE",
    -10: "error opening OOC files — check MKL_PARDISO_OOC_PATH exists + writable",
    -11: "read/write error with OOC files — check disk space + permissions",
    -12: "pardiso_64 called from 32-bit library",
}


def _ooc_preflight(A: sp.sparray) -> None:
    """Validate MKL OOC prerequisites before invoking the factor.

    MKL returns numeric codes (-9, -10, -11) when the OOC path can't
    proceed; callers see ``RuntimeError: MKL Pardiso error -9`` with
    no context.  Check the three most common causes up front and
    raise a descriptive ``RuntimeError`` so the fix is obvious.

    Parameters
    ----------
    A : scipy.sparse
        The matrix about to be factored.  Its ``nnz`` seeds the
        disk-space + max-core-size sanity thresholds below.

    Raises
    ------
    RuntimeError
        If the OOC path + cache directory isn't writable, doesn't
        have plausibly enough free space, or the in-memory budget
        knob is absent or too small.  Message names the env var /
        filesystem condition to change.
    """
    ooc_path = os.environ.get("MKL_PARDISO_OOC_PATH", os.getcwd())
    ooc_dir = Path(ooc_path)
    # MKL writes ``<ooc_path>/pardiso_ooc_*`` files.  If the path
    # *prefix* points at a directory we write into it; otherwise the
    # parent directory must exist + be writable so MKL can create the
    # files.
    check_dir = ooc_dir if ooc_dir.is_dir() else ooc_dir.parent
    if not check_dir.is_dir() or not os.access(check_dir, os.W_OK):
        raise RuntimeError(
            f"OOC: MKL_PARDISO_OOC_PATH={ooc_path!r} is not a writable "
            f"directory (resolved {check_dir}).  Set MKL_PARDISO_OOC_PATH "
            f"to a writable directory before constructing the solver."
        )

    # Disk space: MKL's OOC factor spills on the order of
    # factor_nnz × 12 B + workspace.  We don't know factor_nnz before
    # the symbolic factor runs; upper-bound from A.nnz × 20 as a
    # conservative SPD-3D fill estimate.  Warn-worthy shortfalls
    # raise — the user is better off seeing "disk too small" now
    # than MKL -11 after a 10-minute analyse phase.
    est_spill_bytes = int(A.nnz) * 12 * 20
    try:
        stat = shutil.disk_usage(check_dir)
    except OSError:
        # Exotic filesystem (some mount without statvfs?) — skip the
        # size check rather than fail.
        return
    if stat.free < est_spill_bytes:
        raise RuntimeError(
            f"OOC: {stat.free / 1e9:.1f} GB free at {check_dir}, but "
            f"estimated factor spill is ~{est_spill_bytes / 1e9:.1f} GB "
            f"(20× the matrix nnz as a conservative 3D-SPD fill bound). "
            f"Free up space or point MKL_PARDISO_OOC_PATH at a larger "
            f"filesystem."
        )

    # In-memory budget.  MKL's practical floor is ~500 MB — below
    # that, any non-trivial 3D front returns -9 immediately.
    mcs = os.environ.get("MKL_PARDISO_OOC_MAX_CORE_SIZE")
    if mcs is None:
        # A zero / unset value tells MKL to use its internal default
        # (2000 MB in recent builds), which is fine for matrices up to
        # ~2 M DOF.  Warn via the docstring rather than raising, so
        # small problems don't demand env setup.
        return
    try:
        mcs_int = int(mcs)
    except ValueError as exc:
        raise RuntimeError(
            f"OOC: MKL_PARDISO_OOC_MAX_CORE_SIZE={mcs!r} is not a valid "
            f"integer (MB).  Unset the variable or give it a positive int."
        ) from exc
    if mcs_int < 500:
        raise RuntimeError(
            f"OOC: MKL_PARDISO_OOC_MAX_CORE_SIZE={mcs_int} MB is below "
            f"MKL's practical floor (~500 MB).  Bump it to at least 500 — "
            f"the heuristic ``half the estimated factor size`` usually "
            f"works."
        )


class DirectMklPardisoSolver(LinearSolverBase):
    """ctypes-level MKL Pardiso direct solver.

    Parameters
    ----------
    A : scipy.sparse.csr_array
        Real-valued SPD (``assume_spd=True``) or general square sparse
        matrix.  When ``assume_spd=True`` and ``A.femorph_triu`` is
        set, we use the supplied triu storage directly.  Otherwise we
        extract ``sp.triu(A)`` ourselves before handing to MKL.
    assume_spd : bool, default False
        Flip to MKL's ``mtype=2`` real-SPD Cholesky path.
    mixed_precision : bool or None, default None
        ``True`` asks MKL for a single-precision factor with double-
        precision iterative refinement (iparm(28)=1).  ``None`` (auto)
        stays double-precision — the MKL builds we test against
        silently decline this flag, and even when honoured the
        iterative-refinement buffers often undo the factor-memory win.
        ``False`` forces double precision regardless.
    ooc : bool, default False
        ``True`` enables out-of-core factorisation (iparm(60)=2 —
        spill factor to disk).  Trades wall-time for the ability to
        factor K that would otherwise exceed RAM.
    msglvl : int, default 0
        Pass ``1`` to dump MKL's factor statistics to stdout — useful
        when diagnosing whether a flag (MP / OOC / parallel) actually
        engaged.
    """

    name: ClassVar[str] = "mkl_direct"
    kind: ClassVar[str] = "direct"
    spd_only: ClassVar[bool] = False
    install_hint: ClassVar[str] = "same MKL as ``pardiso`` (``pip install pypardiso``)"

    @staticmethod
    def available() -> bool:
        """True when an ``mkl_rt`` library was loaded at import time."""
        return _HAVE

    def __init__(
        self,
        A,
        *,
        assume_spd: bool = False,
        mixed_precision: bool | None = None,
        ooc: bool = False,
        msglvl: int = 0,
    ) -> None:
        # Extend the base contract with ``ooc`` and ``msglvl``.  Must
        # set these before ``super().__init__`` because the base calls
        # ``_factor`` from within its own ``__init__``.
        self._ooc = ooc
        self._msglvl_val = msglvl
        super().__init__(A, assume_spd=assume_spd, mixed_precision=mixed_precision)

    def _factor(self, A):
        """Analyse + factorise ``A`` (MKL phase 12) and cache state.

        Raises
        ------
        ImportError
            When no MKL runtime is available.
        RuntimeError
            On any non-zero MKL status (see ``_MKL_ERROR_MESSAGES``),
            or on OOC misconfiguration caught by ``_ooc_preflight``.
        """
        if not _HAVE:
            raise ImportError("MKL Pardiso not available")
        if self._assume_spd:
            if getattr(A, "femorph_triu", False):
                # Caller already hands us upper-triangular CSR storage.
                self._A = A
            else:
                self._A = sp.triu(A, format="csr").astype(np.float64)
            self._mtype = c_int32(_MTYPE_SPD)
        else:
            self._A = A.tocsr().astype(np.float64)
            self._mtype = c_int32(_MTYPE_GENERAL)
        if self._A.dtype != np.float64:
            self._A = self._A.astype(np.float64)
        if not self._A.has_sorted_indices:
            self._A.sort_indices()

        # Cache int32 CSR arrays so the hot solve path hits a single
        # ctypes.data_as — no ``astype`` copy on every invocation.
        # ``iparm[34] = 1`` below tells MKL these are 0-based, so no
        # ``+1`` rewrite is needed (saving the ~65 MB alloc/solve
        # pypardiso pays at 150²).
        self._indptr = np.ascontiguousarray(self._A.indptr, dtype=np.int32)
        self._indices = np.ascontiguousarray(self._A.indices, dtype=np.int32)

        self._n = c_int32(self._A.shape[0])
        self._pt = np.zeros(64, dtype=_PT_NP)
        self._iparm = np.zeros(64, dtype=np.int32)
        # iparm[0] = 1 — do *not* use default iparm; we populate every
        # field we care about explicitly below.  (Default of 0 makes
        # MKL overwrite our choices.)
        self._iparm[0] = 1
        self._iparm[1] = 2  # reordering: METIS (MKL's best in practice)
        self._iparm[9] = 13  # pivot perturbation 1e-13
        self._iparm[17] = -1  # output: factor nnz reported back

        # iparm[23] = 1: the "improved 2-level" parallel factorisation.
        # On a 128×128×2 HEX8 plate this variant halves per-solve
        # wall-time vs the classical path at the cost of +10-15 % peak
        # RSS (per-thread scratch for the level DAG).
        #
        # OOC incompatibility: iparm(23)=1 + iparm(24)=2 is NOT
        # supported in combination with iparm(60)=2 (out-of-core
        # factorisation) — MKL returns error -2 at phase=12 on any
        # non-trivial front.  When OOC is requested we fall back to the
        # classical parallel path (iparm(23)=0, iparm(24)=0), trading
        # the ~2× solve-wall win for OOC actually working.
        if not getattr(self, "_ooc", False):
            self._iparm[23] = 1
            # iparm[24] = 2: "two-level parallel" forward/backward
            # solve — the pair to iparm[23] = 1.  iparm[24] = 1 on its
            # own exposes a memory/perf regression in this MKL build
            # that the 1 + 2 combination avoids.
            self._iparm[24] = 2
        else:
            self._iparm[23] = 0
            self._iparm[24] = 0

        self._iparm[27] = 0  # double precision factor (default)
        self._iparm[34] = 1  # **0-based CSR** — scipy contract, no +1
        self._iparm[35] = 0  # no Schur complement
        self._iparm[59] = 0  # in-core (default)
        if self._mixed_precision is True:
            self._iparm[27] = 1  # request single-precision factor + IR
        if getattr(self, "_ooc", False):
            # Fail fast on the three OOC setup mistakes before MKL
            # returns a numeric error code with no context.
            _ooc_preflight(self._A)
            self._iparm[59] = 2

        self._perm = np.zeros(self._A.shape[0], dtype=np.int32)
        self._msglvl = c_int32(int(getattr(self, "_msglvl_val", 0)))

        # Analyse + factorise in one call (phase 12 = 11 + 22).  MKL
        # stamps factor stats into iparm[16..18]; keep what the solve
        # loop and __del__ need on self so repeated solves never reach
        # into dead buffers while ctypes pointers are live.
        self._call(phase=12)
        self._factor_nnz = int(self._iparm[17])
        self._released = False

    def solve(self, b: np.ndarray) -> np.ndarray:
        """Back-substitute (phase 33): return ``x`` with ``A @ x = b``.

        ``b`` may be 1-D (single RHS) or 2-D ``(n, nrhs)``.  MKL's
        Fortran-style interface reads and writes multi-RHS arrays
        column-major, so 2-D inputs are staged in Fortran order before
        the call — a C-ordered 2-D buffer would be interpreted
        transposed and yield wrong results.
        """
        b = np.asarray(b, dtype=np.float64)
        if b.ndim > 1:
            b = np.asfortranarray(b)
            x = np.empty_like(b, order="F")
        else:
            b = np.ascontiguousarray(b)
            x = np.empty_like(b)
        self._call(phase=33, b=b, x=x)
        return x

    def _call(
        self,
        phase: int,
        b: np.ndarray | None = None,
        x: np.ndarray | None = None,
    ) -> None:
        """Invoke the MKL ``pardiso`` entry point for one phase.

        Raises ``RuntimeError`` with a decoded hint from
        ``_MKL_ERROR_MESSAGES`` on any non-zero MKL status.
        """
        maxfct = c_int32(1)
        mnum = c_int32(1)
        ph = c_int32(phase)
        # nrhs follows b's second dimension; ``solve`` stages 2-D
        # arrays column-major as MKL's Fortran interface requires.
        nrhs = c_int32(1 if b is None or b.ndim == 1 else b.shape[1])
        err = c_int32(0)
        if b is None:
            # Phase-12 / phase=-1 don't reference b/x; give MKL
            # zero-length buffers to dereference (any valid ptr works).
            b_arr = np.empty(0, dtype=np.float64)
            x_arr = np.empty(0, dtype=np.float64)
        else:
            b_arr = b
            x_arr = x if x is not None else np.empty_like(b)
        _pardiso(
            self._pt.ctypes.data_as(POINTER(_PT_TYPE)),
            byref(maxfct),
            byref(mnum),
            byref(self._mtype),
            byref(ph),
            byref(self._n),
            self._A.data.ctypes.data_as(ctypes.c_void_p),
            self._indptr.ctypes.data_as(POINTER(c_int32)),
            self._indices.ctypes.data_as(POINTER(c_int32)),
            self._perm.ctypes.data_as(POINTER(c_int32)),
            byref(nrhs),
            self._iparm.ctypes.data_as(POINTER(c_int32)),
            byref(self._msglvl),
            b_arr.ctypes.data_as(ctypes.c_void_p),
            x_arr.ctypes.data_as(ctypes.c_void_p),
            byref(err),
        )
        if err.value != 0:
            hint = _MKL_ERROR_MESSAGES.get(err.value, "see oneMKL Pardiso return codes")
            raise RuntimeError(f"MKL Pardiso error {err.value} in phase {phase}: {hint}")

    def release(self) -> None:
        """Release MKL's internal factor memory (phase = -1)."""
        # Default True: if _factor never completed there is nothing to
        # release.
        if getattr(self, "_released", True):
            return
        # If MKL's already torn down (interpreter shutdown), let it go.
        with contextlib.suppress(RuntimeError):
            self._call(phase=-1)
        self._released = True

    def __del__(self) -> None:
        # phase=-1 tells MKL to free every internal allocation for this
        # ``pt`` handle.  Without this, repeated PardisoSolver creation
        # accumulates MKL-side factor memory (~factor_nnz × 8 B per
        # solve) — agent-1 caught the same leak in pypardiso (#107).
        # Guard against partial init: if _factor() never ran, _pt
        # doesn't exist and we have nothing to release.
        if getattr(self, "_pt", None) is not None:
            with contextlib.suppress(Exception):
                self.release()