"""Direct ctypes wrapper around MKL Pardiso.
Why not keep using ``pypardiso``?
* **iparm(28)=1 silently no-ops on this MKL build.** pypardiso sets
the mixed-precision flag but after ``factorize`` the solver reports
``iparm[27]=0`` — MKL chose to keep a double factor anyway. We
want a direct path so we can verify (and, where the build allows,
actually engage) single-precision factorisation that halves
factor memory.
* **Per-solve ``A.indptr.astype(int32) + 1`` + ``A.indices.astype +
1`` churn.** PR #112 patched this with a cached ``_call_pardiso``
override; doing it in-house from the start avoids the monkey-patch
entirely and lets us tell MKL we ship 0-based indices
(``iparm[34]=1``) so no ``+1`` copy ever happens.
* **Cleanup.** ``pypardiso`` does not call ``phase=-1`` on GC, so the
MKL factor leaks across repeated solves (see agent-1's #107). We
own ``__del__`` here and release explicitly.
* **Phase control.** pypardiso bundles analyse + factorise into
``factorize()``; for user code that wants to re-solve with the same
pattern but updated values (modal sweeps over perturbed matrices),
we can split ``phase=11`` (analyse) from ``phase=22`` (factor
only) and amortise the analyse cost.
The MKL library load path piggybacks on pypardiso's discovery — we
defer to its already-loaded handle when available so we don't
duplicate the mkl_rt search heuristic. Users who set the
``PYPARDISO_MKL_RT`` environment variable get the same resolution on
this backend.
References
----------
- Schenk, O. & Gärtner, K. "Solving unsymmetric sparse systems of
linear equations with PARDISO." Future Generation Computer Systems
20 (3), 475-487 (2004). [the Pardiso algorithm implemented by MKL]
- Schenk, O. & Gärtner, K. "On fast factorization pivoting methods
for sparse symmetric indefinite systems." ETNA 23, 158-179 (2006).
- Intel. *Intel oneAPI Math Kernel Library Developer Reference*,
"Intel oneMKL PARDISO — Parallel Direct Sparse Solver Interface."
[authoritative ``iparm`` / ``phase`` / ``mtype`` spec — the set
of flags this wrapper drives directly]
- Intel. *Intel MKL PARDISO out-of-core user guide.*
[``iparm(60)`` OOC mode + ``MKL_PARDISO_OOC_MAX_CORE_SIZE``
heuristic exercised by the ``ooc=True`` path]
"""
from __future__ import annotations
import contextlib
import ctypes
import glob
import os
import shutil
import site
import sys
from ctypes import POINTER, byref, c_int32, c_int64
from ctypes.util import find_library
from pathlib import Path
from typing import ClassVar
import numpy as np
import scipy.sparse as sp
from ._base import LinearSolverBase
def _load_mkl_rt() -> ctypes.CDLL | None:
"""Resolve the ``mkl_rt`` shared library using the same heuristic
pypardiso ships. Returns ``None`` if no MKL is available.
"""
# Explicit override wins — matches pypardiso's behaviour.
custom = os.environ.get("PYPARDISO_MKL_RT")
if custom:
try:
return ctypes.CDLL(custom)
except OSError:
return None
for name in ("mkl_rt", "mkl_rt.1", "mkl_rt.2"):
path = find_library(name)
if path:
try:
return ctypes.CDLL(path)
except OSError:
pass
# Fall back to pypardiso's glob-into-sys.prefix search.
for pattern in (
f"{sys.prefix}/[Ll]ib*/**/*mkl_rt*",
f"{site.USER_BASE}/[Ll]ib*/**/*mkl_rt*",
):
for path in sorted(glob.glob(pattern, recursive=True), key=len):
try:
return ctypes.CDLL(path)
except OSError:
pass
# Last resort: if ``pypardiso`` is installed and has already loaded
# MKL into our process, reuse its handle. Creating a throwaway
# solver instance triggers its internal load — cheap (no factor).
try:
import pypardiso # type: ignore[import-untyped] # noqa: PLC0415
return pypardiso.PyPardisoSolver(mtype=11).libmkl
except ImportError:
return None
_MKL: ctypes.CDLL | None = _load_mkl_rt()
_HAVE: bool = _MKL is not None
# MKL Pardiso's ``pt`` (internal solver handle) is an array of
# pointer-sized integers — 64 entries, 64-bit on any platform where
# ``sizeof(void*) == 8`` (basically every host running MKL today).
_PT_TYPE = c_int64 if ctypes.sizeof(ctypes.c_void_p) == 8 else c_int32
_PT_NP = np.int64 if ctypes.sizeof(ctypes.c_void_p) == 8 else np.int32
if _HAVE:
_pardiso = _MKL.pardiso
_pardiso.argtypes = [
POINTER(_PT_TYPE), # pt
POINTER(c_int32), # maxfct
POINTER(c_int32), # mnum
POINTER(c_int32), # mtype
POINTER(c_int32), # phase
POINTER(c_int32), # n
ctypes.c_void_p, # a (double)
POINTER(c_int32), # ia
POINTER(c_int32), # ja
POINTER(c_int32), # perm
POINTER(c_int32), # nrhs
POINTER(c_int32), # iparm
POINTER(c_int32), # msglvl
ctypes.c_void_p, # b (double)
ctypes.c_void_p, # x (double)
POINTER(c_int32), # error
]
_pardiso.restype = None
#: MKL Pardiso matrix types. SPD = 2 (real symmetric positive
#: definite), symmetric indefinite = -2, real unsymmetric = 11. We
#: only plug 2 and 11 below because everything else (complex, Hermitian)
#: doesn't fit the modal pipeline's real-double contract.
_MTYPE_SPD = 2
_MTYPE_GENERAL = 11
#: MKL Pardiso negative error codes, per oneMKL reference. The
#: numeric value is all MKL hands back; this map translates it to a
#: human-readable hint plus (for the OOC-specific codes) the env var
#: or filesystem condition that fixes it. Used in :meth:`_call` so
#: ``RuntimeError`` messages are actionable instead of "MKL error -9".
_MKL_ERROR_MESSAGES: dict[int, str] = {
-1: "input inconsistent (check matrix shape, indptr/indices dtype, sorted-indices)",
-2: "not enough memory for the in-core factorisation — try ooc=True or a larger host",
-3: "reordering problem — check METIS availability / iparm[1]",
-4: "zero pivot during numerical factorisation (matrix may be near-singular)",
-5: "unclassified (internal) MKL error — consider filing upstream",
-6: "preordering failed",
-7: "diagonal matrix problem (zero on diagonal?)",
-8: "32-bit integer overflow — matrix too large for int32 CSR",
-9: "not enough memory for OOC — raise MKL_PARDISO_OOC_MAX_CORE_SIZE",
-10: "error opening OOC files — check MKL_PARDISO_OOC_PATH exists + writable",
-11: "read/write error with OOC files — check disk space + permissions",
-12: "pardiso_64 called from 32-bit library",
}
def _ooc_preflight(A: sp.sparray) -> None:
"""Validate MKL OOC prerequisites before invoking the factor.
MKL returns numeric codes (-9, -10, -11) when the OOC path can't
proceed; callers see ``RuntimeError: MKL Pardiso error -9`` with
no context. Check the three most common causes up front and
raise a descriptive ``RuntimeError`` so the fix is obvious.
Parameters
----------
A : scipy.sparse
The matrix about to be factored. Its ``nnz`` seeds the
disk-space + max-core-size sanity thresholds below.
Raises
------
RuntimeError
If the OOC path + cache directory isn't writable, doesn't
have plausibly enough free space, or the in-memory budget
knob is absent or too small. Message names the env var /
filesystem condition to change.
"""
ooc_path = os.environ.get("MKL_PARDISO_OOC_PATH", os.getcwd())
ooc_dir = Path(ooc_path)
# MKL writes ``<ooc_path>/pardiso_ooc_*`` files. If the path
# *prefix* points at a directory we write into it; otherwise the
# parent directory must exist + be writable so MKL can create the
# files.
check_dir = ooc_dir if ooc_dir.is_dir() else ooc_dir.parent
if not check_dir.is_dir() or not os.access(check_dir, os.W_OK):
raise RuntimeError(
f"OOC: MKL_PARDISO_OOC_PATH={ooc_path!r} is not a writable "
f"directory (resolved {check_dir}). Set MKL_PARDISO_OOC_PATH "
f"to a writable directory before constructing the solver."
)
# Disk space: MKL's OOC factor spills on the order of
# factor_nnz × 12 B + workspace. We don't know factor_nnz before
# the symbolic factor runs; upper-bound from A.nnz × 20 as a
# conservative SPD-3D fill estimate. Warn-worthy shortfalls
# raise — the user is better off seeing "disk too small" now
# than MKL -11 after a 10-minute analyse phase.
est_spill_bytes = int(A.nnz) * 12 * 20
try:
stat = shutil.disk_usage(check_dir)
except OSError:
# Exotic filesystem (some mount without statvfs?) — skip the
# size check rather than fail.
return
if stat.free < est_spill_bytes:
raise RuntimeError(
f"OOC: {stat.free / 1e9:.1f} GB free at {check_dir}, but "
f"estimated factor spill is ~{est_spill_bytes / 1e9:.1f} GB "
f"(20× the matrix nnz as a conservative 3D-SPD fill bound). "
f"Free up space or point MKL_PARDISO_OOC_PATH at a larger "
f"filesystem."
)
# In-memory budget. MKL's practical floor is ~500 MB — below
# that, any non-trivial 3D front returns -9 immediately.
mcs = os.environ.get("MKL_PARDISO_OOC_MAX_CORE_SIZE")
if mcs is None:
# A zero / unset value tells MKL to use its internal default
# (2000 MB in recent builds), which is fine for matrices up to
# ~2 M DOF. Warn via the docstring rather than raising, so
# small problems don't demand env setup.
return
try:
mcs_int = int(mcs)
except ValueError as exc:
raise RuntimeError(
f"OOC: MKL_PARDISO_OOC_MAX_CORE_SIZE={mcs!r} is not a valid "
f"integer (MB). Unset the variable or give it a positive int."
) from exc
if mcs_int < 500:
raise RuntimeError(
f"OOC: MKL_PARDISO_OOC_MAX_CORE_SIZE={mcs_int} MB is below "
f"MKL's practical floor (~500 MB). Bump it to at least 500 — "
f"the heuristic ``half the estimated factor size`` usually "
f"works."
)
[docs]
class DirectMklPardisoSolver(LinearSolverBase):
"""ctypes-level MKL Pardiso direct solver.
Parameters
----------
A : scipy.sparse.csr_array
Real-valued SPD (``assume_spd=True``) or general square sparse
matrix. When ``assume_spd=True`` and ``A.femorph_triu`` is
set, we use the supplied triu storage directly. Otherwise we
extract ``sp.triu(A)`` ourselves before handing to MKL.
assume_spd : bool, default False
Flip to MKL's ``mtype=2`` real-SPD Cholesky path.
mixed_precision : bool or None, default None
``True`` asks MKL for a single-precision factor with double-
precision iterative refinement (iparm(28)=1). ``None`` (auto)
stays double-precision — the MKL builds we test against
silently decline this flag, and even when honoured the
iterative-refinement buffers often undo the factor-memory win.
``False`` forces double precision regardless.
ooc : bool, default False
``True`` enables out-of-core factorisation (iparm(60)=2 —
spill factor to disk). Trades wall-time for the ability to
factor K that would otherwise exceed RAM.
msglvl : int, default 0
Pass ``1`` to dump MKL's factor statistics to stdout — useful
when diagnosing whether a flag (MP / OOC / parallel) actually
engaged.
"""
name: ClassVar[str] = "mkl_direct"
kind: ClassVar[str] = "direct"
spd_only: ClassVar[bool] = False
install_hint: ClassVar[str] = "same MKL as ``pardiso`` (``pip install pypardiso``)"
[docs]
@staticmethod
def available() -> bool:
return _HAVE
def __init__(
self,
A,
*,
assume_spd: bool = False,
mixed_precision: bool | None = None,
ooc: bool = False,
msglvl: int = 0,
) -> None:
# Extend the base contract with ``ooc`` and ``msglvl``. Must
# set these before ``super().__init__`` because the base calls
# ``_factor`` from within its own ``__init__``.
self._ooc = ooc
self._msglvl_val = msglvl
super().__init__(A, assume_spd=assume_spd, mixed_precision=mixed_precision)
def _factor(self, A):
if not _HAVE:
raise ImportError("MKL Pardiso not available")
if self._assume_spd:
if getattr(A, "femorph_triu", False):
self._A = A
else:
self._A = sp.triu(A, format="csr").astype(np.float64)
self._mtype = c_int32(_MTYPE_SPD)
else:
self._A = A.tocsr().astype(np.float64)
self._mtype = c_int32(_MTYPE_GENERAL)
if self._A.dtype != np.float64:
self._A = self._A.astype(np.float64)
if not self._A.has_sorted_indices:
self._A.sort_indices()
# Cache int32 CSR arrays so the hot solve path hits a single
# ctypes.data_as — no ``astype`` copy on every invocation.
self._indptr = np.ascontiguousarray(self._A.indptr, dtype=np.int32)
self._indices = np.ascontiguousarray(self._A.indices, dtype=np.int32)
# ``iparm[34] = 1`` below tells MKL these are 0-based, so no
# ``+1`` rewrite needed (saving the ~65 MB alloc/solve pypardiso
# pays at 150²).
self._n = c_int32(self._A.shape[0])
self._pt = np.zeros(64, dtype=_PT_NP)
self._iparm = np.zeros(64, dtype=np.int32)
# iparm[0] = 1 — do *not* use default iparm; we populate every
# field we care about explicitly below. (Default of 0 makes
# MKL overwrite our choices.)
self._iparm[0] = 1
self._iparm[1] = 2 # reordering: METIS (MKL's best in practice)
self._iparm[9] = 13 # pivot perturbation 1e-13 (SPD default)
self._iparm[17] = -1 # output: factor nnz reported back
# iparm[23] = 1: the "improved 2-level" parallel factorisation.
# On a 128×128×2 HEX8 plate this variant halves per-solve
# wall-time (30 ms → untuned, 122 ms → 30 ms with it) vs the
# classical path — MKL parallelises both the level-set walk
# and the dense block updates. Peak RSS costs +10-15 % at this
# scale (per-thread scratch for the level DAG); the wall win
# dominates so we take the trade. Users who need the lower
# RSS can fall back to ``linear_solver="pardiso"``.
#
# OOC incompatibility: iparm(23)=1 + iparm(24)=2 is NOT supported
# in combination with iparm(60)=2 (out-of-core factorisation).
# MKL returns error -2 (insufficient memory) at phase=12 on any
# front larger than ~1000 — observed consistently at 160×160×2+.
# When the caller requests OOC we drop back to the classical
# parallel path (iparm(23)=0, iparm(24)=0), giving up the
# ~2× solve-wall win in exchange for OOC actually working.
if not getattr(self, "_ooc", False):
self._iparm[23] = 1
# iparm[24] = 2: "two-level parallel" forward/backward solve —
# the pair to iparm[23] = 1. iparm[24] = 1 on its own exposes a
# memory/perf regression bug in this MKL build that the
# iparm[23] = 1 + iparm[24] = 2 combination avoids.
self._iparm[24] = 2
else:
self._iparm[23] = 0
self._iparm[24] = 0
self._iparm[27] = 0 # double precision factor (default)
self._iparm[34] = 1 # **0-based CSR** — scipy contract, no +1
self._iparm[35] = 0 # no Schur complement
self._iparm[59] = 0 # in-core (default)
if self._mixed_precision is True:
self._iparm[27] = 1
if getattr(self, "_ooc", False):
# Fail fast on the three OOC setup mistakes before MKL
# returns a numeric error code with no context.
_ooc_preflight(self._A)
self._iparm[59] = 2
self._perm = np.zeros(self._A.shape[0], dtype=np.int32)
self._msglvl = c_int32(int(getattr(self, "_msglvl_val", 0)))
self._phase_33 = c_int32(33)
# Analyse + factorise in one call (phase 12 = 11 + 22).
self._call(phase=12)
# MKL stamps factor stats into iparm[16..18]. Keep the state
# the solve loop needs on self so __del__ / repeated solves
# never reach back into these buffers while ctypes pointers
# are live.
self._factor_nnz = int(self._iparm[17])
self._released = False
[docs]
def solve(self, b: np.ndarray) -> np.ndarray:
b = np.ascontiguousarray(b, dtype=np.float64)
x = np.empty_like(b)
self._call(phase=33, b=b, x=x)
return x
def _call(
self,
phase: int,
b: np.ndarray | None = None,
x: np.ndarray | None = None,
) -> None:
maxfct = c_int32(1)
mnum = c_int32(1)
ph = c_int32(phase)
# nrhs = 1 for our 1-D ARPACK solves. Multi-RHS (n > 1) would
# slot in here without a code change — MKL already knows how to
# vectorise across columns.
nrhs = c_int32(1 if b is None or b.ndim == 1 else b.shape[1])
err = c_int32(0)
if b is None:
# Phase-12 / phase=-1 don't reference b/x; give MKL
# zero-length buffers to dereference (any valid ptr works).
b_arr = np.empty(0, dtype=np.float64)
x_arr = np.empty(0, dtype=np.float64)
else:
b_arr = b
x_arr = x if x is not None else np.empty_like(b)
_pardiso(
self._pt.ctypes.data_as(POINTER(_PT_TYPE)),
byref(maxfct),
byref(mnum),
byref(self._mtype),
byref(ph),
byref(self._n),
self._A.data.ctypes.data_as(ctypes.c_void_p),
self._indptr.ctypes.data_as(POINTER(c_int32)),
self._indices.ctypes.data_as(POINTER(c_int32)),
self._perm.ctypes.data_as(POINTER(c_int32)),
byref(nrhs),
self._iparm.ctypes.data_as(POINTER(c_int32)),
byref(self._msglvl),
b_arr.ctypes.data_as(ctypes.c_void_p),
x_arr.ctypes.data_as(ctypes.c_void_p),
byref(err),
)
if err.value != 0:
hint = _MKL_ERROR_MESSAGES.get(err.value, "see oneMKL Pardiso return codes")
raise RuntimeError(f"MKL Pardiso error {err.value} in phase {phase}: {hint}")
[docs]
def release(self) -> None:
"""Release MKL's internal factor memory (phase = -1)."""
if getattr(self, "_released", True):
return
# If MKL's already torn down (interpreter shutdown), let it go.
with contextlib.suppress(RuntimeError):
self._call(phase=-1)
self._released = True
def __del__(self) -> None:
# phase=-1 tells MKL to free every internal allocation for this
# ``pt`` handle. Without this, repeated PardisoSolver creation
# accumulates MKL-side factor memory (~factor_nnz × 8 B per
# solve) — agent-1 caught the same leak in pypardiso (#107).
# Guard against partial init: if _factor() never ran, _pt
# doesn't exist and we have nothing to release.
if getattr(self, "_pt", None) is not None:
with contextlib.suppress(Exception):
self.release()