Source code for femorph_solver.estimators._loader

"""Load TA-6 JSON benchmark output into estimator training rows.

Input contract: ``femorph-benchmark-*.json`` produced by
:mod:`femorph_solver.benchmark`.  Schema is append-only and
documented in :file:`doc/source/user-guide/solving/benchmark.rst`;
this loader reads what it needs via ``.get()`` so new fields
don't break older historical files.
"""

from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class TrainingRow:
    """One immutable training sample for the estimator.

    Rows are produced by :func:`load_training_rows` (benchmark JSON)
    and :func:`load_validation_rows` (validation JSON); no live solver
    state is required.  ``host_signature`` groups rows that share a
    box so the estimator can fit per-host coefficients when enough
    data is available.
    """

    # Host identifier: scraped from the benchmark file's ``host_report``
    # text, or passed in by the caller for validation files.
    host_signature: str
    # Value of the ``n_dof`` key; the loaders fall back to 0 when absent.
    # Presumably the number of degrees of freedom — confirm against schema.
    n_dof: int
    # Value of the ``n_modes`` key (benchmark ``spec`` default 10,
    # validation ``mesh_params`` default 1).
    n_modes: int
    # ``spec["linear_solver"]``; "auto" when absent and for validation rows.
    linear_solver: str
    # ``spec["eigen_solver"]``; "arpack" when absent and for validation rows.
    eigen_solver: str
    # ``spec["ooc"]`` flag; always False for validation rows.
    # Presumably "out-of-core" — TODO confirm.
    ooc: bool
    # Value of the ``wall_s`` key; presumably wall-clock seconds (from name).
    wall_s: float
    # Value of the ``eig_s`` key; the validation loader copies ``wall_s`` here.
    eig_s: float
    # Value of the ``peak_rss_mb`` key; presumably peak resident memory in MB.
    peak_rss_mb: float
    # JSON file this row was read from.
    source_path: Path


def _host_signature_from_report(report: str | None) -> str:
    """Extract a stable host signature from the Report string.

    The Report block contains ``CPU model`` / ``RAM (MB)`` / ``OS``
    lines of the form ``<label> : <value>`` that can be scraped
    deterministically.

    Parameters
    ----------
    report
        Text of the host report.  ``None``, an empty string, or any
        non-string value is treated as "no report available".

    Returns
    -------
    str
        ``"<cpu>|<ram>|<os>"``, with ``"unknown"`` standing in for any
        individual field that was not found.  When *no* field could be
        scraped at all the result is ``"unknown-host"`` — the same
        placeholder :func:`load_validation_rows` uses by default — so
        rows without host information share one bucket.  This function
        never raises.
    """
    if not isinstance(report, str):
        # e.g. a JSON object/array/null stored under ``host_report``.
        return "unknown-host"

    cpu: str | None = None
    ram: str | None = None
    os_line: str | None = None
    for raw in report.splitlines():
        if ":" not in raw:
            continue
        value = raw.split(":", 1)[1].strip()
        stripped = raw.strip()
        if "CPU model" in raw:
            # Last matching line wins, as before.
            cpu = value
        elif stripped.startswith("RAM (MB)"):
            ram = value
        elif stripped.startswith("OS ") and os_line is None:
            # Only the first ``OS`` line is used; later ones are ignored.
            os_line = value

    if cpu is None and ram is None and os_line is None:
        return "unknown-host"
    return "|".join("unknown" if part is None else part for part in (cpu, ram, os_line))


def load_training_rows(paths: list[Path | str] | None = None) -> list[TrainingRow]:
    """Scan a list of benchmark JSON files into training rows.

    Parameters
    ----------
    paths
        Explicit list of JSON file paths.  When ``None``, globs the
        current working directory for ``femorph-benchmark-*.json`` —
        useful when a user dumps their last few runs alongside the
        script they're running.

    Returns
    -------
    list[TrainingRow]
        Every successful benchmark row, one per entry.  Failed rows
        (falsy or missing ``ok``) are skipped so the estimator doesn't
        train on timeouts / crashes.

    Notes
    -----
    Loading is best-effort.  Files that cannot be read, are not valid
    UTF-8, are not valid JSON, or whose top-level value is not an
    object are skipped, as are ``rows`` entries that are not objects.
    JSON ``null`` in a numeric or solver field is treated like a
    missing key and replaced by that field's default.
    """
    if paths is None:
        paths = sorted(Path.cwd().glob("femorph-benchmark-*.json"))
    else:
        paths = [Path(p) for p in paths]

    rows: list[TrainingRow] = []
    for p in paths:
        try:
            # JSON is UTF-8 by specification; don't depend on the locale.
            payload = json.loads(p.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            continue
        if not isinstance(payload, dict):
            continue

        report = payload.get("host_report")
        host_sig = _host_signature_from_report(report if isinstance(report, str) else "")

        entries = payload.get("rows")
        if not isinstance(entries, list):
            continue
        for r in entries:
            if not isinstance(r, dict) or not r.get("ok"):
                continue
            spec = r.get("spec")
            if not isinstance(spec, dict):
                spec = {}
            n_modes = spec.get("n_modes")
            rows.append(
                TrainingRow(
                    host_signature=host_sig,
                    # Row-level value wins; fall back to the spec, then 0.
                    n_dof=int(r.get("n_dof") or spec.get("n_dof") or 0),
                    n_modes=int(10 if n_modes is None else n_modes),
                    linear_solver=str(spec.get("linear_solver") or "auto"),
                    eigen_solver=str(spec.get("eigen_solver") or "arpack"),
                    ooc=bool(spec.get("ooc", False)),
                    wall_s=float(r.get("wall_s") or 0.0),
                    eig_s=float(r.get("eig_s") or 0.0),
                    peak_rss_mb=float(r.get("peak_rss_mb") or 0.0),
                    source_path=p,
                )
            )
    return rows
def load_validation_rows(
    paths: list[Path | str] | None = None,
    *,
    host_signature: str = "unknown-host",
) -> list[TrainingRow]:
    """Scan validation JSON files into estimator training rows.

    Validation runs (via :mod:`femorph_solver.validation`) write their
    convergence records through
    :func:`femorph_solver.validation._report.render_json`.  Each record
    carries per-refinement ``wall_s`` + ``peak_rss_mb`` fields (added in
    TA-10h-8) — exactly what the estimator training fit consumes.

    Because the validation reports have one file per study and don't
    embed a host report, the caller passes an explicit
    ``host_signature`` — typically derived from a companion
    :class:`femorph_solver.Report` snapshot.  If omitted the loader
    uses ``"unknown-host"``; the estimator's per-host bucket handling
    gracefully degrades in that case.

    Parameters
    ----------
    paths
        Explicit list of JSON file paths.  When ``None``, globs the
        current working directory for ``femorph-validation-*.json``
        and ``validation.json``.
    host_signature
        A stable signature for the host these runs came from.

    Returns
    -------
    list[TrainingRow]
        One row per refinement × quantity.  When the same (problem,
        refinement) appears in multiple published quantities, the
        identical wall / peak row is emitted for each quantity — the
        estimator treats them as duplicate observations of the same
        solve cost, which is desired.

    Notes
    -----
    Loading is best-effort.  Files that cannot be read, are not valid
    UTF-8, are not valid JSON, or whose top-level value is not a list
    are skipped, as are records / results that are not objects and
    results without both ``wall_s`` and ``peak_rss_mb``.
    """
    if paths is None:
        cwd = Path.cwd()
        paths = sorted(cwd.glob("femorph-validation-*.json")) + sorted(cwd.glob("validation.json"))
    else:
        paths = [Path(p) for p in paths]

    rows: list[TrainingRow] = []
    for p in paths:
        try:
            # JSON is UTF-8 by specification; don't depend on the locale.
            payload = json.loads(p.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            continue
        if not isinstance(payload, list):
            continue
        for rec in payload:
            if not isinstance(rec, dict):
                continue
            results = rec.get("results")
            if not isinstance(results, list):
                continue
            for r in results:
                if not isinstance(r, dict):
                    continue
                # Only rows with timing present are usable.
                wall = r.get("wall_s")
                peak = r.get("peak_rss_mb")
                if wall is None or peak is None:
                    continue
                mesh = r.get("mesh_params")
                if not isinstance(mesh, dict):
                    mesh = {}
                n_modes = mesh.get("n_modes")
                rows.append(
                    TrainingRow(
                        host_signature=host_signature,
                        n_dof=int(r.get("n_dof") or 0),
                        n_modes=int(1 if n_modes is None else n_modes),
                        linear_solver="auto",
                        eigen_solver="arpack",
                        ooc=False,
                        wall_s=float(wall),
                        # Validation runs don't separately time the
                        # eigensolve, so the full wall time is used as
                        # an approximation.
                        # NOTE(review): an earlier comment said static
                        # studies leave this at zero, but no study type
                        # is inspected here — every row gets
                        # eig_s == wall_s.  Confirm which is intended.
                        eig_s=float(wall),
                        peak_rss_mb=float(peak),
                        source_path=p,
                    )
                )
    return rows