# Source code for femorph_solver.solvers.linear._mkl_pardiso

"""Direct ctypes wrapper around MKL Pardiso.

Why not keep using ``pypardiso``?

  * **iparm(28)=1 silently no-ops on this MKL build.** pypardiso sets
    the mixed-precision flag but after ``factorize`` the solver reports
    ``iparm[27]=0`` — MKL chose to keep a double factor anyway.  We
    want a direct path so we can verify (and, where the build allows,
    actually engage) single-precision factorisation that halves
    factor memory.
  * **Per-solve ``A.indptr.astype(int32) + 1`` + ``A.indices.astype +
    1`` churn.** PR #112 patched this with a cached ``_call_pardiso``
    override; doing it in-house from the start avoids the monkey-patch
    entirely and lets us tell MKL we ship 0-based indices
    (``iparm[34]=1``) so no ``+1`` copy ever happens.
  * **Cleanup.** ``pypardiso`` does not call ``phase=-1`` on GC, so the
    MKL factor leaks across repeated solves (see agent-1's #107).  We
    own ``__del__`` here and release explicitly.
  * **Phase control.** pypardiso bundles analyse + factorise into
    ``factorize()``; for user code that wants to re-solve with the same
    pattern but updated values (modal sweeps over perturbed matrices),
    we can split ``phase=11`` (analyse) from ``phase=22`` (factor
    only) and amortise the analyse cost.

The MKL library load path piggybacks on pypardiso's discovery — we
defer to its already-loaded handle when available so we don't
duplicate the mkl_rt search heuristic.  Users who set the
``PYPARDISO_MKL_RT`` environment variable get the same resolution on
this backend.

References
----------
- Schenk, O. & Gärtner, K.  "Solving unsymmetric sparse systems of
  linear equations with PARDISO."  Future Generation Computer Systems
  20 (3), 475-487 (2004).  [the Pardiso algorithm implemented by MKL]
- Schenk, O. & Gärtner, K.  "On fast factorization pivoting methods
  for sparse symmetric indefinite systems."  ETNA 23, 158-179 (2006).
- Intel.  *Intel oneAPI Math Kernel Library Developer Reference*,
  "Intel oneMKL PARDISO — Parallel Direct Sparse Solver Interface."
  [authoritative ``iparm`` / ``phase`` / ``mtype`` spec — the set
  of flags this wrapper drives directly]
- Intel.  *Intel MKL PARDISO out-of-core user guide.*
  [``iparm(60)`` OOC mode + ``MKL_PARDISO_OOC_MAX_CORE_SIZE``
  heuristic exercised by the ``ooc=True`` path]
"""

from __future__ import annotations

import contextlib
import ctypes
import glob
import os
import shutil
import site
import sys
from ctypes import POINTER, byref, c_int32, c_int64
from ctypes.util import find_library
from pathlib import Path
from typing import ClassVar

import numpy as np
import scipy.sparse as sp

from ._base import LinearSolverBase


def _load_mkl_rt() -> ctypes.CDLL | None:
    """Resolve the ``mkl_rt`` shared library using the same heuristic
    pypardiso ships.  Returns ``None`` if no MKL is available.
    """
    # Explicit override wins — matches pypardiso's behaviour.
    custom = os.environ.get("PYPARDISO_MKL_RT")
    if custom:
        try:
            return ctypes.CDLL(custom)
        except OSError:
            return None

    for name in ("mkl_rt", "mkl_rt.1", "mkl_rt.2"):
        path = find_library(name)
        if path:
            try:
                return ctypes.CDLL(path)
            except OSError:
                pass

    # Fall back to pypardiso's glob-into-sys.prefix search.
    for pattern in (
        f"{sys.prefix}/[Ll]ib*/**/*mkl_rt*",
        f"{site.USER_BASE}/[Ll]ib*/**/*mkl_rt*",
    ):
        for path in sorted(glob.glob(pattern, recursive=True), key=len):
            try:
                return ctypes.CDLL(path)
            except OSError:
                pass

    # Last resort: if ``pypardiso`` is installed and has already loaded
    # MKL into our process, reuse its handle.  Creating a throwaway
    # solver instance triggers its internal load — cheap (no factor).
    try:
        import pypardiso  # type: ignore[import-untyped]  # noqa: PLC0415

        return pypardiso.PyPardisoSolver(mtype=11).libmkl
    except ImportError:
        return None


#: Process-wide MKL handle, resolved once at import time; ``None`` when
#: no MKL runtime could be located by :func:`_load_mkl_rt`.
_MKL: ctypes.CDLL | None = _load_mkl_rt()
#: Availability flag consumed by ``DirectMklPardisoSolver.available``.
_HAVE: bool = _MKL is not None

# MKL Pardiso's ``pt`` (internal solver handle) is an array of
# pointer-sized integers — 64 entries, 64-bit on any platform where
# ``sizeof(void*) == 8`` (basically every host running MKL today).
# ``_PT_TYPE`` is the ctypes element type for the prototype below;
# ``_PT_NP`` is the matching numpy dtype for the instance's pt buffer.
_PT_TYPE = c_int64 if ctypes.sizeof(ctypes.c_void_p) == 8 else c_int32
_PT_NP = np.int64 if ctypes.sizeof(ctypes.c_void_p) == 8 else np.int32


if _HAVE:
    # Declare the C prototype once at import so every later call goes
    # through a type-checked ctypes signature.  Only declared when MKL
    # actually loaded; ``available()`` gates callers away otherwise.
    _pardiso = _MKL.pardiso
    _pardiso.argtypes = [
        POINTER(_PT_TYPE),  # pt: opaque 64-slot internal solver handle
        POINTER(c_int32),  # maxfct: factors kept simultaneously (we use 1)
        POINTER(c_int32),  # mnum: which factor to use, 1-based (always 1)
        POINTER(c_int32),  # mtype: matrix type (2 = SPD, 11 = general real)
        POINTER(c_int32),  # phase: 11/22/33/12/-1 solver phase selector
        POINTER(c_int32),  # n: matrix order
        ctypes.c_void_p,  # a: CSR values (double)
        POINTER(c_int32),  # ia: CSR indptr (0-based via iparm[34]=1)
        POINTER(c_int32),  # ja: CSR indices (0-based via iparm[34]=1)
        POINTER(c_int32),  # perm: user permutation (unused here, zeros)
        POINTER(c_int32),  # nrhs: number of right-hand sides
        POINTER(c_int32),  # iparm: 64-slot control/statistics array
        POINTER(c_int32),  # msglvl: 1 = print factor statistics to stdout
        ctypes.c_void_p,  # b: right-hand side buffer (double)
        ctypes.c_void_p,  # x: solution buffer (double)
        POINTER(c_int32),  # error: MKL status out-param (0 = success)
    ]
    _pardiso.restype = None  # the C routine returns void


#: MKL Pardiso matrix types.  SPD = 2 (real symmetric positive
#: definite), symmetric indefinite = -2, real unsymmetric = 11.  We
#: only plug 2 and 11 below because everything else (complex, Hermitian)
#: doesn't fit the modal pipeline's real-double contract.
_MTYPE_SPD = 2
_MTYPE_GENERAL = 11

#: MKL Pardiso negative error codes, per oneMKL reference.  The
#: numeric value is all MKL hands back; this map translates it to a
#: human-readable hint plus (for the OOC-specific codes) the env var
#: or filesystem condition that fixes it.  Used in :meth:`_call` so
#: ``RuntimeError`` messages are actionable instead of "MKL error -9".
#: Unknown codes fall back to a generic "see oneMKL return codes" hint.
_MKL_ERROR_MESSAGES: dict[int, str] = {
    -1: "input inconsistent (check matrix shape, indptr/indices dtype, sorted-indices)",
    -2: "not enough memory for the in-core factorisation — try ooc=True or a larger host",
    -3: "reordering problem — check METIS availability / iparm[1]",
    -4: "zero pivot during numerical factorisation (matrix may be near-singular)",
    -5: "unclassified (internal) MKL error — consider filing upstream",
    -6: "preordering failed",
    -7: "diagonal matrix problem (zero on diagonal?)",
    -8: "32-bit integer overflow — matrix too large for int32 CSR",
    -9: "not enough memory for OOC — raise MKL_PARDISO_OOC_MAX_CORE_SIZE",
    -10: "error opening OOC files — check MKL_PARDISO_OOC_PATH exists + writable",
    -11: "read/write error with OOC files — check disk space + permissions",
    -12: "pardiso_64 called from 32-bit library",
}


def _ooc_preflight(A: sp.sparray) -> None:
    """Validate MKL OOC prerequisites before invoking the factor.

    MKL returns numeric codes (-9, -10, -11) when the OOC path can't
    proceed; callers see ``RuntimeError: MKL Pardiso error -9`` with
    no context.  Check the three most common causes up front and
    raise a descriptive ``RuntimeError`` so the fix is obvious.

    Parameters
    ----------
    A : scipy.sparse
        The matrix about to be factored.  Its ``nnz`` seeds the
        disk-space + max-core-size sanity thresholds below.

    Raises
    ------
    RuntimeError
        If the OOC path + cache directory isn't writable, doesn't
        have plausibly enough free space, or the in-memory budget
        knob is absent or too small.  Message names the env var /
        filesystem condition to change.
    """
    ooc_path = os.environ.get("MKL_PARDISO_OOC_PATH", os.getcwd())
    ooc_dir = Path(ooc_path)
    # MKL writes ``<ooc_path>/pardiso_ooc_*`` files.  If the path
    # *prefix* points at a directory we write into it; otherwise the
    # parent directory must exist + be writable so MKL can create the
    # files.
    check_dir = ooc_dir if ooc_dir.is_dir() else ooc_dir.parent
    if not check_dir.is_dir() or not os.access(check_dir, os.W_OK):
        raise RuntimeError(
            f"OOC: MKL_PARDISO_OOC_PATH={ooc_path!r} is not a writable "
            f"directory (resolved {check_dir}).  Set MKL_PARDISO_OOC_PATH "
            f"to a writable directory before constructing the solver."
        )

    # Disk space: MKL's OOC factor spills on the order of
    # factor_nnz × 12 B + workspace.  We don't know factor_nnz before
    # the symbolic factor runs; upper-bound from A.nnz × 20 as a
    # conservative SPD-3D fill estimate.  Warn-worthy shortfalls
    # raise — the user is better off seeing "disk too small" now
    # than MKL -11 after a 10-minute analyse phase.
    est_spill_bytes = int(A.nnz) * 12 * 20
    try:
        stat = shutil.disk_usage(check_dir)
    except OSError:
        # Exotic filesystem (some mount without statvfs?) — skip the
        # size check rather than fail.
        return
    if stat.free < est_spill_bytes:
        raise RuntimeError(
            f"OOC: {stat.free / 1e9:.1f} GB free at {check_dir}, but "
            f"estimated factor spill is ~{est_spill_bytes / 1e9:.1f} GB "
            f"(20× the matrix nnz as a conservative 3D-SPD fill bound). "
            f"Free up space or point MKL_PARDISO_OOC_PATH at a larger "
            f"filesystem."
        )

    # In-memory budget.  MKL's practical floor is ~500 MB — below
    # that, any non-trivial 3D front returns -9 immediately.
    mcs = os.environ.get("MKL_PARDISO_OOC_MAX_CORE_SIZE")
    if mcs is None:
        # A zero / unset value tells MKL to use its internal default
        # (2000 MB in recent builds), which is fine for matrices up to
        # ~2 M DOF.  Warn via the docstring rather than raising, so
        # small problems don't demand env setup.
        return
    try:
        mcs_int = int(mcs)
    except ValueError as exc:
        raise RuntimeError(
            f"OOC: MKL_PARDISO_OOC_MAX_CORE_SIZE={mcs!r} is not a valid "
            f"integer (MB).  Unset the variable or give it a positive int."
        ) from exc
    if mcs_int < 500:
        raise RuntimeError(
            f"OOC: MKL_PARDISO_OOC_MAX_CORE_SIZE={mcs_int} MB is below "
            f"MKL's practical floor (~500 MB).  Bump it to at least 500 — "
            f"the heuristic ``half the estimated factor size`` usually "
            f"works."
        )


class DirectMklPardisoSolver(LinearSolverBase):
    """ctypes-level MKL Pardiso direct solver.

    Parameters
    ----------
    A : scipy.sparse.csr_array
        Real-valued SPD (``assume_spd=True``) or general square sparse
        matrix.  When ``assume_spd=True`` and ``A.femorph_triu`` is
        set, we use the supplied triu storage directly.  Otherwise we
        extract ``sp.triu(A)`` ourselves before handing to MKL.
    assume_spd : bool, default False
        Flip to MKL's ``mtype=2`` real-SPD Cholesky path.
    mixed_precision : bool or None, default None
        ``True`` asks MKL for a single-precision factor with double-
        precision iterative refinement (iparm(28)=1).  ``None`` (auto)
        stays double-precision — the MKL builds we test against
        silently decline this flag, and even when honoured the
        iterative-refinement buffers often undo the factor-memory win.
        ``False`` forces double precision regardless.
    ooc : bool, default False
        ``True`` enables out-of-core factorisation (iparm(60)=2 —
        spill factor to disk).  Trades wall-time for the ability to
        factor K that would otherwise exceed RAM.
    msglvl : int, default 0
        Pass ``1`` to dump MKL's factor statistics to stdout — useful
        when diagnosing whether a flag (MP / OOC / parallel) actually
        engaged.
    """

    name: ClassVar[str] = "mkl_direct"
    kind: ClassVar[str] = "direct"
    spd_only: ClassVar[bool] = False
    install_hint: ClassVar[str] = "same MKL as ``pardiso`` (``pip install pypardiso``)"

    @staticmethod
    def available() -> bool:
        """True when an ``mkl_rt`` library was loaded at import time."""
        return _HAVE

    def __init__(
        self,
        A,
        *,
        assume_spd: bool = False,
        mixed_precision: bool | None = None,
        ooc: bool = False,
        msglvl: int = 0,
    ) -> None:
        # Extend the base contract with ``ooc`` and ``msglvl``.  Must
        # set these before ``super().__init__`` because the base calls
        # ``_factor`` from within its own ``__init__``.
        self._ooc = ooc
        self._msglvl_val = msglvl
        super().__init__(A, assume_spd=assume_spd, mixed_precision=mixed_precision)

    def _factor(self, A):
        """Analyse + factorise ``A`` (MKL phase 12) and cache state.

        Raises
        ------
        ImportError
            When no MKL runtime is available.
        RuntimeError
            On any non-zero MKL status (see ``_MKL_ERROR_MESSAGES``),
            or on OOC misconfiguration caught by ``_ooc_preflight``.
        """
        if not _HAVE:
            raise ImportError("MKL Pardiso not available")
        if self._assume_spd:
            if getattr(A, "femorph_triu", False):
                # Caller already hands us upper-triangular CSR storage.
                self._A = A
            else:
                self._A = sp.triu(A, format="csr").astype(np.float64)
            self._mtype = c_int32(_MTYPE_SPD)
        else:
            self._A = A.tocsr().astype(np.float64)
            self._mtype = c_int32(_MTYPE_GENERAL)
        if self._A.dtype != np.float64:
            self._A = self._A.astype(np.float64)
        if not self._A.has_sorted_indices:
            self._A.sort_indices()

        # Cache int32 CSR arrays so the hot solve path hits a single
        # ctypes.data_as — no ``astype`` copy on every invocation.
        # ``iparm[34] = 1`` below tells MKL these are 0-based, so no
        # ``+1`` rewrite is needed (saving the ~65 MB alloc/solve
        # pypardiso pays at 150²).
        self._indptr = np.ascontiguousarray(self._A.indptr, dtype=np.int32)
        self._indices = np.ascontiguousarray(self._A.indices, dtype=np.int32)

        self._n = c_int32(self._A.shape[0])
        self._pt = np.zeros(64, dtype=_PT_NP)
        self._iparm = np.zeros(64, dtype=np.int32)
        # iparm[0] = 1 — do *not* use default iparm; we populate every
        # field we care about explicitly below.  (Default of 0 makes
        # MKL overwrite our choices.)
        self._iparm[0] = 1
        self._iparm[1] = 2  # reordering: METIS (MKL's best in practice)
        self._iparm[9] = 13  # pivot perturbation 1e-13
        self._iparm[17] = -1  # output: factor nnz reported back

        # iparm[23] = 1: the "improved 2-level" parallel factorisation.
        # On a 128×128×2 HEX8 plate this variant halves per-solve
        # wall-time vs the classical path at the cost of +10-15 % peak
        # RSS (per-thread scratch for the level DAG).
        #
        # OOC incompatibility: iparm(23)=1 + iparm(24)=2 is NOT
        # supported in combination with iparm(60)=2 (out-of-core
        # factorisation) — MKL returns error -2 at phase=12 on any
        # non-trivial front.  When OOC is requested we fall back to the
        # classical parallel path (iparm(23)=0, iparm(24)=0), trading
        # the ~2× solve-wall win for OOC actually working.
        if not getattr(self, "_ooc", False):
            self._iparm[23] = 1
            # iparm[24] = 2: "two-level parallel" forward/backward
            # solve — the pair to iparm[23] = 1.  iparm[24] = 1 on its
            # own exposes a memory/perf regression in this MKL build
            # that the 1 + 2 combination avoids.
            self._iparm[24] = 2
        else:
            self._iparm[23] = 0
            self._iparm[24] = 0

        self._iparm[27] = 0  # double precision factor (default)
        self._iparm[34] = 1  # **0-based CSR** — scipy contract, no +1
        self._iparm[35] = 0  # no Schur complement
        self._iparm[59] = 0  # in-core (default)
        if self._mixed_precision is True:
            self._iparm[27] = 1  # request single-precision factor + IR
        if getattr(self, "_ooc", False):
            # Fail fast on the three OOC setup mistakes before MKL
            # returns a numeric error code with no context.
            _ooc_preflight(self._A)
            self._iparm[59] = 2

        self._perm = np.zeros(self._A.shape[0], dtype=np.int32)
        self._msglvl = c_int32(int(getattr(self, "_msglvl_val", 0)))

        # Analyse + factorise in one call (phase 12 = 11 + 22).  MKL
        # stamps factor stats into iparm[16..18]; keep what the solve
        # loop and __del__ need on self so repeated solves never reach
        # into dead buffers while ctypes pointers are live.
        self._call(phase=12)
        self._factor_nnz = int(self._iparm[17])
        self._released = False

    def solve(self, b: np.ndarray) -> np.ndarray:
        """Back-substitute (phase 33): return ``x`` with ``A @ x = b``.

        ``b`` may be 1-D (single RHS) or 2-D ``(n, nrhs)``.  MKL's
        Fortran-style interface reads and writes multi-RHS arrays
        column-major, so 2-D inputs are staged in Fortran order before
        the call — a C-ordered 2-D buffer would be interpreted
        transposed and yield wrong results.
        """
        b = np.asarray(b, dtype=np.float64)
        if b.ndim > 1:
            b = np.asfortranarray(b)
            x = np.empty_like(b, order="F")
        else:
            b = np.ascontiguousarray(b)
            x = np.empty_like(b)
        self._call(phase=33, b=b, x=x)
        return x

    def _call(
        self,
        phase: int,
        b: np.ndarray | None = None,
        x: np.ndarray | None = None,
    ) -> None:
        """Invoke the MKL ``pardiso`` entry point for one phase.

        Raises ``RuntimeError`` with a decoded hint from
        ``_MKL_ERROR_MESSAGES`` on any non-zero MKL status.
        """
        maxfct = c_int32(1)
        mnum = c_int32(1)
        ph = c_int32(phase)
        # nrhs follows b's second dimension; ``solve`` stages 2-D
        # arrays column-major as MKL's Fortran interface requires.
        nrhs = c_int32(1 if b is None or b.ndim == 1 else b.shape[1])
        err = c_int32(0)
        if b is None:
            # Phase-12 / phase=-1 don't reference b/x; give MKL
            # zero-length buffers to dereference (any valid ptr works).
            b_arr = np.empty(0, dtype=np.float64)
            x_arr = np.empty(0, dtype=np.float64)
        else:
            b_arr = b
            x_arr = x if x is not None else np.empty_like(b)
        _pardiso(
            self._pt.ctypes.data_as(POINTER(_PT_TYPE)),
            byref(maxfct),
            byref(mnum),
            byref(self._mtype),
            byref(ph),
            byref(self._n),
            self._A.data.ctypes.data_as(ctypes.c_void_p),
            self._indptr.ctypes.data_as(POINTER(c_int32)),
            self._indices.ctypes.data_as(POINTER(c_int32)),
            self._perm.ctypes.data_as(POINTER(c_int32)),
            byref(nrhs),
            self._iparm.ctypes.data_as(POINTER(c_int32)),
            byref(self._msglvl),
            b_arr.ctypes.data_as(ctypes.c_void_p),
            x_arr.ctypes.data_as(ctypes.c_void_p),
            byref(err),
        )
        if err.value != 0:
            hint = _MKL_ERROR_MESSAGES.get(err.value, "see oneMKL Pardiso return codes")
            raise RuntimeError(f"MKL Pardiso error {err.value} in phase {phase}: {hint}")

    def release(self) -> None:
        """Release MKL's internal factor memory (phase = -1)."""
        # Default True: if _factor never completed there is nothing to
        # release.
        if getattr(self, "_released", True):
            return
        # If MKL's already torn down (interpreter shutdown), let it go.
        with contextlib.suppress(RuntimeError):
            self._call(phase=-1)
        self._released = True

    def __del__(self) -> None:
        # phase=-1 tells MKL to free every internal allocation for this
        # ``pt`` handle.  Without this, repeated PardisoSolver creation
        # accumulates MKL-side factor memory (~factor_nnz × 8 B per
        # solve) — agent-1 caught the same leak in pypardiso (#107).
        # Guard against partial init: if _factor() never ran, _pt
        # doesn't exist and we have nothing to release.
        if getattr(self, "_pt", None) is not None:
            with contextlib.suppress(Exception):
                self.release()