diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..9e5db41 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "algorithms/kyber"] + path = algorithms/kyber + url = https://github.com/pq-crystals/kyber diff --git a/algorithms/kyber b/algorithms/kyber new file mode 160000 index 0000000..4768bd3 --- /dev/null +++ b/algorithms/kyber @@ -0,0 +1 @@ +Subproject commit 4768bd37c02f9c40a46cb49d4d1f4d5e612bb882 diff --git a/analysis/analyze.py b/analysis/analyze.py new file mode 100644 index 0000000..10cad3d --- /dev/null +++ b/analysis/analyze.py @@ -0,0 +1,286 @@ +#!/usr/bin/env python3 +"""Statistical analysis of pqc-bench results. + +Parses .out files via the Go aggregator, then computes a three-way +decomposition of where speedup originates: + + refo0 → refnv compiler optimisation (O3, no vectorisation) + refnv → ref compiler auto-vectorisation + ref → avx2 hand-written SIMD + +Usage: + # Run aggregator inline: + python3 analysis/analyze.py --data data/raw/kyber + + # Or pre-generate the raw JSON once, then reuse it: + go run ./analysis/cmd/aggregate --raw --out /tmp/bench.json data/raw/kyber + python3 analysis/analyze.py --json /tmp/bench.json + + # Write JSON output for figure generation: + python3 analysis/analyze.py --data data/raw/kyber --out analysis/results.json +""" + +import argparse +import json +import subprocess +import sys +from pathlib import Path + +import numpy as np +from scipy import stats as scipy_stats + +# --------------------------------------------------------------------------- +# Data loading +# --------------------------------------------------------------------------- + +REPO_ROOT = Path(__file__).resolve().parent.parent + + +def load_json(path: str) -> list[dict]: + with open(path) as f: + return json.load(f) + + +def run_aggregator(data_dir: str) -> list[dict]: + """Run the Go aggregator and return parsed records.""" + cmd = ["go", "run", "./cmd/aggregate", "--raw", data_dir] + result = subprocess.run(cmd, 
capture_output=True, text=True, cwd=REPO_ROOT / "analysis") + if result.returncode != 0: + print(f"aggregator failed:\n{result.stderr}", file=sys.stderr) + sys.exit(1) + return json.loads(result.stdout) + + +# --------------------------------------------------------------------------- +# Statistics +# --------------------------------------------------------------------------- + +def cliffs_delta_from_u(u: float, m: int, n: int) -> float: + """Cliff's delta derived from Mann-Whitney U statistic. + + U = number of pairs (faster_i, baseline_j) where faster_i < baseline_j. + delta = (2U - m*n) / (m*n) ∈ [-1, +1] + Positive → faster dominates baseline. + """ + return (2 * u - m * n) / (m * n) + + +def bootstrap_speedup_ci( + baseline: np.ndarray, + faster: np.ndarray, + n_boot: int = 5_000, + ci: float = 0.95, + rng: np.random.Generator | None = None, +) -> tuple[float, float]: + """95% bootstrap CI for speedup = median(baseline) / median(faster). + + Resamples both arrays independently using vectorised indexing; returns (lo, hi). + """ + if rng is None: + rng = np.random.default_rng(42) + m, n = len(baseline), len(faster) + # Draw all indices at once: shape (n_boot, m) and (n_boot, n) + bi = rng.integers(0, m, size=(n_boot, m)) + fi = rng.integers(0, n, size=(n_boot, n)) + b_samples = baseline[bi] # (n_boot, m) + f_samples = faster[fi] # (n_boot, n) + # Median along axis=1 for each bootstrap replicate + ratios = np.median(b_samples, axis=1) / np.median(f_samples, axis=1) + alpha = (1 - ci) / 2 + return float(np.percentile(ratios, alpha * 100)), float(np.percentile(ratios, (1 - alpha) * 100)) + + +def compare(baseline: np.ndarray, faster: np.ndarray, rng: np.random.Generator) -> dict: + """Full pairwise comparison: speedup + CI + Mann-Whitney + Cliff's delta.""" + speedup = float(np.median(baseline)) / float(np.median(faster)) + ci_lo, ci_hi = bootstrap_speedup_ci(baseline, faster, rng=rng) + + # One-sided Mann-Whitney: is faster < baseline in cycle counts? 
+ m, n = len(faster), len(baseline) + u_stat, p_val = scipy_stats.mannwhitneyu(faster, baseline, alternative="less") + + # Cliff's delta derived from U — O(n log n), same cost as Mann-Whitney + delta = cliffs_delta_from_u(float(u_stat), m, n) + + return { + "speedup": speedup, + "ci95": [ci_lo, ci_hi], + "mannwhitney_p": float(p_val), + "cliffs_delta": delta, + "n_baseline": n, + "n_faster": m, + } + + +# --------------------------------------------------------------------------- +# Analysis +# --------------------------------------------------------------------------- + +VARIANTS = ("refo0", "refnv", "ref", "avx2") + +# Canonical operation order for display +OP_ORDER = [ + "NTT", "INVNTT", "basemul", "frommsg", + "gen_a", "poly_getnoise_eta1", "poly_getnoise_eta2", + "keygen", "enc", "dec", +] + + +def analyze(records: list[dict]) -> list[dict]: + # Build lookup: (algorithm, variant, operation) → raw array + raw: dict[tuple[str, str, str], np.ndarray] = {} + for r in records: + if r.get("raw"): + raw[(r["algorithm"], r["variant"], r["operation"])] = np.array(r["raw"], dtype=np.float64) + + # Collect all (algorithm, operation) pairs present across all variants + alg_ops = sorted( + {(alg, op) for alg, var, op in raw}, + key=lambda x: (x[0], _op_rank(x[1])), + ) + + rng = np.random.default_rng(42) + results = [] + for alg, op in alg_ops: + arrays = {v: raw[(alg, v, op)] for v in VARIANTS if (alg, v, op) in raw} + + if len(arrays) < 2: + continue + + row: dict = { + "algorithm": alg, + "operation": op, + "medians": {v: float(np.median(a)) for v, a in arrays.items()}, + "n_obs": {v: len(a) for v, a in arrays.items()}, + "comparisons": {}, + } + + comps = row["comparisons"] + + # Three-way decomposition (each step requires both variants present) + if "refo0" in arrays and "refnv" in arrays: + comps["refo0_to_refnv"] = compare(arrays["refo0"], arrays["refnv"], rng) + + if "refnv" in arrays and "ref" in arrays: + comps["refnv_to_ref"] = compare(arrays["refnv"], 
arrays["ref"], rng) + + if "ref" in arrays and "avx2" in arrays: + comps["ref_to_avx2"] = compare(arrays["ref"], arrays["avx2"], rng) + + # Totals + if "refo0" in arrays and "ref" in arrays: + comps["refo0_to_ref"] = compare(arrays["refo0"], arrays["ref"], rng) + + if "refo0" in arrays and "avx2" in arrays: + comps["refo0_to_avx2"] = compare(arrays["refo0"], arrays["avx2"], rng) + + results.append(row) + + return results + + +def _op_rank(op: str) -> int: + try: + return OP_ORDER.index(op) + except ValueError: + return len(OP_ORDER) + + +# --------------------------------------------------------------------------- +# Display +# --------------------------------------------------------------------------- + +def _fmt_speedup(comp: dict | None) -> str: + if comp is None: + return " — " + r = comp["speedup"] + lo, hi = comp["ci95"] + return f"{r:5.2f}x [{lo:.2f},{hi:.2f}]" + + +def _fmt_delta(comp: dict | None) -> str: + if comp is None: + return " —" + return f"{comp['cliffs_delta']:+.3f}" + + +def _fmt_p(comp: dict | None) -> str: + if comp is None: + return " —" + p = comp["mannwhitney_p"] + if p < 1e-300: + return " <1e-300" + if p < 1e-10: + return f" {p:.1e}" + return f" {p:.4f}" + + +def print_table(results: list[dict]) -> None: + algs = sorted({r["algorithm"] for r in results}) + + for alg in algs: + rows = [r for r in results if r["algorithm"] == alg] + rows.sort(key=lambda r: -r["comparisons"].get("ref_to_avx2", {}).get("speedup", 0)) + + print(f"\n{'─'*110}") + print(f" {alg.upper()}") + print(f"{'─'*110}") + print( + f" {'Operation':<24}" + f" {'O3 (no-vec)':>18}" # refo0→refnv + f" {'Auto-vec':>18}" # refnv→ref + f" {'Hand SIMD':>18}" # ref→avx2 + f" {'Total':>18}" # refo0→avx2 + f" {'Cliff δ':>7}" + f" {'p-value':>9}" + ) + print(f" {'':─<24} {'':─<18} {'':─<18} {'':─<18} {'':─<18} {'':─<7} {'':─<9}") + + for r in rows: + c = r["comparisons"] + print( + f" {r['operation']:<24}" + f" {_fmt_speedup(c.get('refo0_to_refnv')):>18}" + f" 
{_fmt_speedup(c.get('refnv_to_ref')):>18}" + f" {_fmt_speedup(c.get('ref_to_avx2')):>18}" + f" {_fmt_speedup(c.get('refo0_to_avx2')):>18}" + f" {_fmt_delta(c.get('ref_to_avx2')):>7}" + f" {_fmt_p(c.get('ref_to_avx2')):>9}" + ) + + print(f"\n{'─'*110}") + print(" Speedup = median(baseline) / median(variant); CI: 95% bootstrap (5000 iterations)") + print(" Cliff δ and p-value are for ref → avx2 comparison (H1: avx2 cycles < ref cycles)") + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +def main() -> None: + parser = argparse.ArgumentParser(description="Statistical analysis of pqc-bench results") + src = parser.add_mutually_exclusive_group(required=True) + src.add_argument("--data", metavar="DIR", help="data directory (runs Go aggregator)") + src.add_argument("--json", metavar="FILE", help="pre-generated aggregate JSON with --raw") + parser.add_argument("--out", metavar="FILE", help="write analysis JSON to this file") + args = parser.parse_args() + + if args.json: + records = load_json(args.json) + print(f"Loaded {len(records)} groups from {args.json}.", file=sys.stderr) + else: + print("Running aggregator...", file=sys.stderr) + records = run_aggregator(args.data) + print(f"Loaded {len(records)} groups.", file=sys.stderr) + + results = analyze(records) + print_table(results) + + if args.out: + with open(args.out, "w") as f: + json.dump(results, f, indent=2) + print(f"\nWrote analysis JSON to {args.out}", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/analysis/cmd/aggregate/main.go b/analysis/cmd/aggregate/main.go new file mode 100644 index 0000000..95689ee --- /dev/null +++ b/analysis/cmd/aggregate/main.go @@ -0,0 +1,215 @@ +// aggregate parses pqc-bench .out files and emits summary statistics as JSON. 
+// +// Usage: +// +// aggregate [--raw] [--out results.json] +// +// It walks for all *.out files, grouping results by the parent +// directory name (algorithm) and the variant inferred from the SLURM header. +// Output is a JSON array of result objects, one per (algorithm, variant, +// operation) triple. +package main + +import ( + "encoding/json" + "flag" + "fmt" + "io/fs" + "os" + "path/filepath" + "slices" + "strings" + + "git.levineuwirth.org/neuwirth/where-simd-helps/analysis/pkg/parse" + "git.levineuwirth.org/neuwirth/where-simd-helps/analysis/pkg/stats" +) + +// Result is one output record: all statistics for a single +// (algorithm, variant, operation) group. +type Result struct { + Algorithm string `json:"algorithm"` + Variant string `json:"variant"` + Operation string `json:"operation"` + Unit string `json:"unit"` + NObservations int `json:"n_observations"` + NRuns int `json:"n_runs"` + Median float64 `json:"median"` + Mean float64 `json:"mean"` + Std float64 `json:"std"` + MAD float64 `json:"mad"` + P5 float64 `json:"p5"` + P25 float64 `json:"p25"` + P75 float64 `json:"p75"` + P95 float64 `json:"p95"` + P99 float64 `json:"p99"` + CI95 [2]float64 `json:"ci95"` + Node string `json:"node"` + Sources []string `json:"sources"` + Raw []int64 `json:"raw,omitempty"` +} + +// groupKey uniquely identifies a (algorithm, variant, operation) combination. +type groupKey struct { + algorithm, variant, operation string +} + +func main() { + rawFlag := flag.Bool("raw", false, "include per-observation cycle counts in output") + outFlag := flag.String("out", "", "write JSON output to this file instead of stdout") + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: aggregate [--raw] [--out FILE] \n") + flag.PrintDefaults() + } + flag.Parse() + + if flag.NArg() != 1 { + flag.Usage() + os.Exit(1) + } + dataDir := flag.Arg(0) + + // Collect all .out files. 
+ var outFiles []string + err := filepath.WalkDir(dataDir, func(path string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + if !d.IsDir() && strings.HasSuffix(path, ".out") { + outFiles = append(outFiles, path) + } + return nil + }) + if err != nil { + fmt.Fprintf(os.Stderr, "error walking %s: %v\n", dataDir, err) + os.Exit(1) + } + if len(outFiles) == 0 { + fmt.Fprintf(os.Stderr, "no .out files found under %s\n", dataDir) + os.Exit(1) + } + + // Parse every file and accumulate observations per group. + type accumulator struct { + values []int64 + sources []string + node string + } + groups := make(map[groupKey]*accumulator) + + for _, path := range outFiles { + run, err := parse.ParseFile(path) + if err != nil { + fmt.Fprintf(os.Stderr, "warning: skipping %s: %v\n", path, err) + continue + } + + algorithm := inferAlgorithm(run.Meta, path) + variant := parse.InferVariant(run.Meta) + + for _, spin := range run.Spins { + for op, m := range spin { + key := groupKey{algorithm, variant, op} + acc := groups[key] + if acc == nil { + acc = &accumulator{node: run.Meta.Node} + groups[key] = acc + } + acc.values = append(acc.values, m.Median) + } + } + + // Record sources per group (any key with this algorithm+variant). + for key, acc := range groups { + if key.algorithm == algorithm && key.variant == variant { + if !slices.Contains(acc.sources, path) { + acc.sources = append(acc.sources, path) + } + } + } + } + + // Build results. 
+ results := make([]Result, 0, len(groups)) + for key, acc := range groups { + sorted := make([]int64, len(acc.values)) + copy(sorted, acc.values) + stats.SortInt64(sorted) + + s := stats.Compute(sorted) + + r := Result{ + Algorithm: key.algorithm, + Variant: key.variant, + Operation: key.operation, + Unit: "cycles", + NObservations: s.N, + NRuns: len(acc.sources), + Median: s.Median, + Mean: s.Mean, + Std: s.Std, + MAD: s.MAD, + P5: s.P5, + P25: s.P25, + P75: s.P75, + P95: s.P95, + P99: s.P99, + CI95: s.CI95, + Node: acc.node, + Sources: acc.sources, + } + if *rawFlag { + r.Raw = acc.values + } + results = append(results, r) + } + + // Sort for stable output: algorithm → variant → operation. + slices.SortFunc(results, func(a, b Result) int { + if a.Algorithm != b.Algorithm { + return strings.Compare(a.Algorithm, b.Algorithm) + } + if a.Variant != b.Variant { + return strings.Compare(a.Variant, b.Variant) + } + return strings.Compare(a.Operation, b.Operation) + }) + + out, err := json.MarshalIndent(results, "", " ") + if err != nil { + fmt.Fprintf(os.Stderr, "error marshalling JSON: %v\n", err) + os.Exit(1) + } + + if *outFlag != "" { + if err := os.WriteFile(*outFlag, out, 0o644); err != nil { + fmt.Fprintf(os.Stderr, "error writing %s: %v\n", *outFlag, err) + os.Exit(1) + } + fmt.Fprintf(os.Stderr, "wrote %d results to %s\n", len(results), *outFlag) + } else { + fmt.Println(string(out)) + } +} + +// inferAlgorithm returns the algorithm name (e.g. "mlkem512") for a run. +// +// Priority: +// 1. BENCH_PARAM metadata → "mlkem{PARAM}" (new-style runs via submit.sh) +// 2. Walk the file path upward for a segment matching "mlkem\d+" (handles +// both flat old-style layout and new nested layout transparently) +// 3. The immediate parent directory name as a last resort. +func inferAlgorithm(meta parse.Meta, filePath string) string { + if meta.BenchParam != "" { + return "mlkem" + meta.BenchParam + } + // Walk path components looking for mlkem\d+. 
+ dir := filepath.Dir(filePath) + for dir != "." && dir != "/" { + base := filepath.Base(dir) + if strings.HasPrefix(base, "mlkem") { + return base + } + dir = filepath.Dir(dir) + } + return filepath.Base(filepath.Dir(filePath)) +} diff --git a/analysis/cmd/analyze-simd/main.go b/analysis/cmd/analyze-simd/main.go new file mode 100644 index 0000000..3753a41 --- /dev/null +++ b/analysis/cmd/analyze-simd/main.go @@ -0,0 +1,242 @@ +// analyze-simd computes speedup ratios from aggregated pqc-bench results. +// +// Usage: +// +// analyze-simd [--baseline ref] [--in results.json] [--out speedups.json] +// +// It reads the JSON produced by 'aggregate', computes per-operation speedups +// relative to the baseline variant, and emits both a human-readable table +// and a structured JSON file suitable for downstream plotting. +package main + +import ( + "cmp" + "encoding/json" + "flag" + "fmt" + "math" + "os" + "slices" + "strings" + "text/tabwriter" +) + +// Record mirrors the aggregate output schema (fields we need). +type Record struct { + Algorithm string `json:"algorithm"` + Variant string `json:"variant"` + Operation string `json:"operation"` + Median float64 `json:"median"` + CI95 [2]float64 `json:"ci95"` + NRuns int `json:"n_runs"` +} + +// Speedup is one variant-vs-baseline comparison for a single (algorithm, operation). +type Speedup struct { + Variant string `json:"variant"` + Median float64 `json:"median"` + Speedup float64 `json:"speedup"` + SpeedupCI [2]float64 `json:"speedup_ci95"` +} + +// Result is one output row: all comparisons for one (algorithm, operation) pair. 
+type Result struct { + Algorithm string `json:"algorithm"` + Operation string `json:"operation"` + BaselineVariant string `json:"baseline_variant"` + BaselineMedian float64 `json:"baseline_median"` + BaselineCI95 [2]float64 `json:"baseline_ci95"` + Comparisons []Speedup `json:"comparisons"` +} + +func main() { + baseline := flag.String("baseline", "ref", "variant to use as the speedup denominator") + inFile := flag.String("in", "results/kyber.json", "input JSON from aggregate") + outFile := flag.String("out", "", "write speedup JSON to this file (default: stdout)") + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: analyze-simd [--baseline VARIANT] [--in FILE] [--out FILE]\n") + flag.PrintDefaults() + } + flag.Parse() + + raw, err := os.ReadFile(*inFile) + if err != nil { + fmt.Fprintf(os.Stderr, "error reading %s: %v\n", *inFile, err) + os.Exit(1) + } + var records []Record + if err := json.Unmarshal(raw, &records); err != nil { + fmt.Fprintf(os.Stderr, "error parsing JSON: %v\n", err) + os.Exit(1) + } + + // Index by (algorithm, variant, operation). + type key struct{ algorithm, variant, operation string } + idx := make(map[key]Record, len(records)) + for _, r := range records { + idx[key{r.Algorithm, r.Variant, r.Operation}] = r + } + + // Collect sorted unique values for stable output. + algorithms := unique(records, func(r Record) string { return r.Algorithm }) + operations := unique(records, func(r Record) string { return r.Operation }) + variants := unique(records, func(r Record) string { return r.Variant }) + // Remove baseline from comparison variants. + variants = slices.DeleteFunc(variants, func(v string) bool { return v == *baseline }) + + // Build results. 
+ var results []Result + for _, alg := range algorithms { + for _, op := range operations { + baseRec, ok := idx[key{alg, *baseline, op}] + if !ok || baseRec.Median == 0 { + continue + } + res := Result{ + Algorithm: alg, + Operation: op, + BaselineVariant: *baseline, + BaselineMedian: baseRec.Median, + BaselineCI95: baseRec.CI95, + } + for _, v := range variants { + cmpRec, ok := idx[key{alg, v, op}] + if !ok || cmpRec.Median == 0 { + continue + } + sp := baseRec.Median / cmpRec.Median + // Conservative CI: ratio of interval bounds. + // speedup_lo = baseline_lo / cmp_hi + // speedup_hi = baseline_hi / cmp_lo + var spCI [2]float64 + if cmpRec.CI95[1] > 0 { + spCI[0] = safeDiv(baseRec.CI95[0], cmpRec.CI95[1]) + } + if cmpRec.CI95[0] > 0 { + spCI[1] = safeDiv(baseRec.CI95[1], cmpRec.CI95[0]) + } + res.Comparisons = append(res.Comparisons, Speedup{ + Variant: v, + Median: cmpRec.Median, + Speedup: sp, + SpeedupCI: spCI, + }) + } + if len(res.Comparisons) > 0 { + results = append(results, res) + } + } + } + + // Print human-readable table to stderr. + printTable(os.Stderr, results, variants, *baseline) + + // Emit JSON. + out, err := json.MarshalIndent(results, "", " ") + if err != nil { + fmt.Fprintf(os.Stderr, "error marshalling JSON: %v\n", err) + os.Exit(1) + } + if *outFile != "" { + if err := os.WriteFile(*outFile, out, 0o644); err != nil { + fmt.Fprintf(os.Stderr, "error writing %s: %v\n", *outFile, err) + os.Exit(1) + } + fmt.Fprintf(os.Stderr, "wrote %d results to %s\n", len(results), *outFile) + } else { + fmt.Println(string(out)) + } +} + +func printTable(w *os.File, results []Result, variants []string, baseline string) { + tw := tabwriter.NewWriter(w, 0, 0, 2, ' ', 0) + + // Group by algorithm. 
+ byAlg := make(map[string][]Result) + for _, r := range results { + byAlg[r.Algorithm] = append(byAlg[r.Algorithm], r) + } + algs := make([]string, 0, len(byAlg)) + for a := range byAlg { + algs = append(algs, a) + } + slices.Sort(algs) + + for _, alg := range algs { + fmt.Fprintf(tw, "\n── %s (baseline: %s) ──\n", strings.ToUpper(alg), baseline) + + // Header. + var hdr strings.Builder + fmt.Fprintf(&hdr, "%-38s\t%12s", "operation", baseline+"(cycles)") + for _, v := range variants { + fmt.Fprintf(&hdr, "\t%10s", v) + } + fmt.Fprintln(tw, hdr.String()) + fmt.Fprintln(tw, strings.Repeat("-", 38+13+11*len(variants))) + + rows := byAlg[alg] + slices.SortFunc(rows, func(a, b Result) int { + // Sort by descending avx2 speedup if available, else alphabetically. + sa := speedupFor(a, "avx2") + sb := speedupFor(b, "avx2") + if sa != sb { + return cmp.Compare(sb, sa) // descending + } + return strings.Compare(a.Operation, b.Operation) + }) + + for _, r := range rows { + var line strings.Builder + fmt.Fprintf(&line, "%-38s\t%12s", r.Operation, formatCycles(r.BaselineMedian)) + for _, v := range variants { + sp := speedupFor(r, v) + if math.IsNaN(sp) { + fmt.Fprintf(&line, "\t%10s", "---") + } else { + fmt.Fprintf(&line, "\t%9.2fx", sp) + } + } + fmt.Fprintln(tw, line.String()) + } + } + tw.Flush() +} + +func speedupFor(r Result, variant string) float64 { + for _, c := range r.Comparisons { + if c.Variant == variant { + return c.Speedup + } + } + return math.NaN() +} + +func formatCycles(c float64) string { + if c >= 1_000_000 { + return fmt.Sprintf("%.2fM", c/1_000_000) + } + if c >= 1_000 { + return fmt.Sprintf("%.1fK", c/1_000) + } + return fmt.Sprintf("%.0f", c) +} + +func safeDiv(a, b float64) float64 { + if b == 0 { + return 0 + } + return a / b +} + +func unique(records []Record, fn func(Record) string) []string { + seen := make(map[string]struct{}) + for _, r := range records { + seen[fn(r)] = struct{}{} + } + out := make([]string, 0, len(seen)) + for k := range 
seen {
		out = append(out, k)
	}
	slices.Sort(out)
	return out
}
diff --git a/analysis/figures.py b/analysis/figures.py
new file mode 100644
index 0000000..d03b4ca
--- /dev/null
+++ b/analysis/figures.py
@@ -0,0 +1,487 @@
#!/usr/bin/env python3
"""Matplotlib draft figures for the PQC SIMD speedup analysis.

Usage:
    python3 analysis/figures.py [--json analysis/results.json] [--out figures/]
"""

import argparse
import json
from pathlib import Path

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------

# Cumulative stages used in Figure 1 (each shows total speedup from refo0)
STAGE_KEYS = ["refo0_to_refnv", "refo0_to_ref", "refo0_to_avx2"]
STAGE_LABELS = ["O3, no auto-vec", "O3 + auto-vec", "O3 + hand SIMD (avx2)"]
STAGE_COLORS = ["#4C72B0", "#55A868", "#C44E52"]

# Ops to show in the primary figures (excludes top-level KEM wrappers)
PRIMARY_OPS = {
    "poly_frommsg", "INVNTT", "polyvec_basemul_acc_montgomery", "NTT",
    "indcpa_dec", "polyvec_decompress", "poly_decompress",
    "poly_compress", "poly_tomsg", "polyvec_compress",
    "indcpa_enc", "indcpa_keypair", "gen_a",
    "poly_getnoise_eta1", "poly_getnoise_eta2",
}

# Short display names (keys must match the aggregator's operation names)
OP_SHORT = {
    "poly_frommsg": "frommsg",
    "INVNTT": "INVNTT",
    "polyvec_basemul_acc_montgomery": "basemul",
    "NTT": "NTT",
    "indcpa_dec": "dec",
    "polyvec_decompress": "pvec_decomp",
    "poly_decompress": "poly_decomp",
    "poly_compress": "poly_comp",
    "poly_tomsg": "tomsg",
    "polyvec_compress": "pvec_comp",
    "indcpa_enc": "enc",
    "indcpa_keypair": "keypair",
    "gen_a": "gen_a",
    "poly_getnoise_eta1": "noise_η₁",
    "poly_getnoise_eta2": "noise_η₂",
}

ALGORITHMS = ["mlkem512", "mlkem768", "mlkem1024"]
ALG_TITLES = {"mlkem512": "ML-KEM-512", "mlkem768": "ML-KEM-768", "mlkem1024": "ML-KEM-1024"}

# Operations selected to illustrate the distribution figure:
# one high-speedup arithmetic op, one medium SHAKE-bound op, one low-speedup op
# NOTE(review): the speedup factors in these labels are hard-coded — they will
# silently go stale if the data changes; consider deriving them from results.
DIST_OPS = [
    ("INVNTT",              "INVNTT\n(~55× speedup)"),
    ("gen_a",               "gen_a\n(~4× speedup)"),
    ("poly_getnoise_eta1",  "noise η₁\n(~1.3× speedup)"),
]

# Per-polynomial ops whose speedup should be param-independent
CROSS_PARAM_OPS = [
    "poly_frommsg",
    "INVNTT",
    "polyvec_basemul_acc_montgomery",
    "NTT",
]

# KEM-level ops for supplementary (consumed by fig_kem_level, defined below)
KEM_OPS = ["kyber_keypair", "kyber_encaps", "kyber_decaps"]
KEM_SHORT = {"kyber_keypair": "KeyGen", "kyber_encaps": "Encaps", "kyber_decaps": "Decaps"}


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def load(json_path: str) -> list[dict]:
    # Load the analysis JSON written by analyze.py (list of per-op rows).
    with open(json_path) as f:
        return json.load(f)


def ops_for_alg(results: list[dict], alg: str) -> list[dict]:
    # Primary-op rows for one algorithm, sorted by descending avx2 speedup;
    # rows without a ref_to_avx2 comparison sort last (speedup treated as 0).
    rows = [r for r in results if r["algorithm"] == alg and r["operation"] in PRIMARY_OPS]
    rows.sort(key=lambda r: -r["comparisons"].get("ref_to_avx2", {}).get("speedup", 0))
    return rows


# ---------------------------------------------------------------------------
# Figure 1: cumulative grouped bars — speedup at each optimisation stage
#
# Each group shows three bars for one operation:
#   refo0→refnv   total speedup with O3, auto-vec OFF
#   refo0→ref     total speedup with O3, auto-vec ON
#   refo0→avx2    total speedup with O3 + hand-written SIMD
#
# Because all bars share the same baseline (refo0=1), they are directly
# comparable without any additive/multiplicative ambiguity.
+# --------------------------------------------------------------------------- + +def fig_decomposition(results: list[dict], out_dir: Path) -> None: + fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharey=False) + + for ax, alg in zip(axes, ALGORITHMS): + rows = ops_for_alg(results, alg) + if not rows: + ax.set_visible(False) + continue + + ops = [OP_SHORT.get(r["operation"], r["operation"]) for r in rows] + n = len(rows) + group = np.arange(n) + # Three bars per group, evenly spaced within each group slot + bar_w = 0.22 + offsets = np.array([-bar_w, 0, bar_w]) + + for (key, label, color), offset in zip( + zip(STAGE_KEYS, STAGE_LABELS, STAGE_COLORS), offsets + ): + vals = np.array([r["comparisons"].get(key, {}).get("speedup", 0.0) for r in rows]) + ci_lo = np.array([r["comparisons"].get(key, {}).get("ci95", [0.0, 0.0])[0] for r in rows]) + ci_hi = np.array([r["comparisons"].get(key, {}).get("ci95", [0.0, 0.0])[1] for r in rows]) + yerr = np.array([vals - ci_lo, ci_hi - vals]) + mask = vals > 0 + + ax.bar(group[mask] + offset, vals[mask], bar_w, + label=label, color=color, alpha=0.88, zorder=3) + ax.errorbar(group[mask] + offset, vals[mask], yerr=yerr[:, mask], + fmt="none", ecolor="black", elinewidth=0.7, capsize=2, zorder=4) + + ax.set_yscale("log") + ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda v, _: f"{v:g}×")) + ax.set_title(ALG_TITLES[alg], fontsize=12, fontweight="bold") + ax.set_xticks(group) + ax.set_xticklabels(ops, rotation=45, ha="right", fontsize=8) + ax.set_ylabel("Speedup over -O0 (×, log scale)" if alg == "mlkem512" else "") + ax.grid(axis="y", which="both", linestyle="--", linewidth=0.4, alpha=0.5, zorder=0) + ax.set_axisbelow(True) + ax.set_xlim(-0.5, n - 0.5) + + handles, labels = axes[0].get_legend_handles_labels() + fig.legend(handles, labels, loc="upper center", ncol=3, + fontsize=10, frameon=True, bbox_to_anchor=(0.5, 1.02)) + fig.suptitle( + "ML-KEM Cumulative Speedup at Each Optimisation Stage " + "(Intel Xeon Platinum 8268, 95% 
bootstrap CI)", + fontsize=11, y=1.06, + ) + fig.tight_layout() + _save(fig, out_dir, "decomposition") + + +# --------------------------------------------------------------------------- +# Figure 2: hand-SIMD speedup (ref→avx2), all algorithms overlaid, log scale +# --------------------------------------------------------------------------- + +def fig_hand_simd(results: list[dict], out_dir: Path) -> None: + all_ops: dict[str, dict] = {} + for r in results: + if r["operation"] in PRIMARY_OPS and "ref_to_avx2" in r["comparisons"]: + all_ops.setdefault(r["operation"], {}) + all_ops[r["operation"]][r["algorithm"]] = r["comparisons"]["ref_to_avx2"] + + ops_sorted = sorted( + all_ops, + key=lambda op: -all_ops[op].get("mlkem512", {}).get("speedup", 0), + ) + short_ops = [OP_SHORT.get(op, op) for op in ops_sorted] + + x = np.arange(len(ops_sorted)) + bar_w = 0.25 + offsets = [-bar_w, 0, bar_w] + colors = ["#4C72B0", "#55A868", "#C44E52"] + + fig, ax = plt.subplots(figsize=(14, 5)) + + for alg, offset, color in zip(ALGORITHMS, offsets, colors): + vals = np.array([all_ops[op].get(alg, {}).get("speedup", 0) for op in ops_sorted]) + ci_lo = np.array([all_ops[op].get(alg, {}).get("ci95", [0, 0])[0] for op in ops_sorted]) + ci_hi = np.array([all_ops[op].get(alg, {}).get("ci95", [0, 0])[1] for op in ops_sorted]) + yerr = np.array([vals - ci_lo, ci_hi - vals]) + mask = vals > 0 + + ax.bar(x[mask] + offset, vals[mask], bar_w, + label=ALG_TITLES[alg], color=color, alpha=0.85, zorder=3) + ax.errorbar(x[mask] + offset, vals[mask], yerr=yerr[:, mask], + fmt="none", ecolor="black", elinewidth=0.7, capsize=2, zorder=4) + + ax.set_yscale("log") + ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda v, _: f"{v:g}×")) + ax.set_xticks(x) + ax.set_xticklabels(short_ops, rotation=45, ha="right", fontsize=9) + ax.set_ylabel("Speedup ref → avx2 (×, log scale)") + ax.set_title( + "Hand-Written SIMD Speedup over Compiler-Optimised C\n" + "(Intel Xeon Platinum 8268, 95% bootstrap CI, n≥2000 
# ---------------------------------------------------------------------------
# Figure 3: Cliff's delta heatmap (ref→avx2)
# ---------------------------------------------------------------------------

def fig_cliffs_heatmap(results: list[dict], out_dir: Path) -> None:
    """Heatmap of Cliff's delta for the ref → avx2 comparison.

    Rows are algorithms, columns are operations sorted so the largest
    effect (max delta across algorithms) appears leftmost.  Cells with no
    ref_to_avx2 comparison stay NaN and render blank.
    """
    # Operations that actually have a ref_to_avx2 comparison, ordered by
    # descending maximum delta (negated key → descending sort).
    ops_set = sorted(
        {r["operation"] for r in results if "ref_to_avx2" in r["comparisons"]},
        key=lambda op: -max(
            r["comparisons"]["ref_to_avx2"]["cliffs_delta"]
            for r in results
            if r["operation"] == op and "ref_to_avx2" in r["comparisons"]
        ),
    )
    short_ops = [OP_SHORT.get(op, op) for op in ops_set]

    # NaN-filled matrix: algorithms × operations.
    data = np.full((len(ALGORITHMS), len(ops_set)), np.nan)
    for i, alg in enumerate(ALGORITHMS):
        for j, op in enumerate(ops_set):
            match = [r for r in results if r["algorithm"] == alg and r["operation"] == op]
            if match and "ref_to_avx2" in match[0]["comparisons"]:
                data[i, j] = match[0]["comparisons"]["ref_to_avx2"]["cliffs_delta"]

    n_ops = len(ops_set)
    fig, ax = plt.subplots(figsize=(max(10, n_ops * 0.85), 3.2))
    im = ax.imshow(data, aspect="auto", cmap="RdYlGn", vmin=-1, vmax=1)
    plt.colorbar(im, ax=ax, label="Cliff's δ", fraction=0.03, pad=0.02)

    ax.set_yticks(range(len(ALGORITHMS)))
    ax.set_yticklabels([ALG_TITLES[a] for a in ALGORITHMS], fontsize=10)
    ax.set_xticks(range(n_ops))
    ax.set_xticklabels(short_ops, rotation=45, ha="right", fontsize=9)
    ax.set_title(
        "Cliff's δ (ref vs. avx2) δ = +1.00: avx2 strictly faster in every observation pair",
        fontsize=10,
    )

    # Annotate every populated cell with its delta value.
    for i in range(len(ALGORITHMS)):
        for j in range(n_ops):
            if not np.isnan(data[i, j]):
                # FIX: both extremes of the RdYlGn colormap are dark (green at
                # +1, red at -1).  The original test `data > 0.85` whitened
                # text only on dark green, leaving black-on-dark-red cells
                # unreadable; use |δ| so either extreme gets white text.
                text_color = "white" if abs(data[i, j]) > 0.85 else "black"
                ax.text(j, i, f"{data[i, j]:+.3f}", ha="center", va="center",
                        fontsize=9, color=text_color, fontweight="bold")

    fig.tight_layout()
    _save(fig, out_dir, "cliffs_delta_heatmap")


# ---------------------------------------------------------------------------
# Figure 4: cycle distribution overlays (requires raw aggregator JSON)
#
# Three panels: one high-speedup op, one medium, one low.
# Each panel overlays ref and avx2 histograms + KDE for mlkem512.
# Log x-axis exposes the scale difference honestly.
# ---------------------------------------------------------------------------

def fig_distributions(raw_records: list[dict], out_dir: Path, alg: str = "mlkem512") -> None:
    """Overlay ref vs. avx2 cycle-count distributions for one algorithm.

    Each panel shows a log-binned histogram, a log-space KDE back-transformed
    to linear density, and dashed median lines for both variants.  Panels
    with no raw data for either variant are hidden.
    """
    from scipy.stats import gaussian_kde

    # Build lookup: (alg, variant, op) → raw array
    raw: dict[tuple, np.ndarray] = {}
    for r in raw_records:
        if r.get("raw"):
            raw[(r["algorithm"], r["variant"], r["operation"])] = np.array(r["raw"], dtype=np.float64)

    n_ops = len(DIST_OPS)
    fig, axes = plt.subplots(1, n_ops, figsize=(5 * n_ops, 4))
    # FIX: plt.subplots(1, 1) returns a bare Axes rather than an array, which
    # would break the zip below if DIST_OPS ever has a single entry.
    axes = np.atleast_1d(axes)

    variant_style = {
        "ref": {"color": "#4C72B0", "label": "ref (O3)", "alpha": 0.55, "zorder": 2},
        "avx2": {"color": "#C44E52", "label": "avx2", "alpha": 0.65, "zorder": 3},
    }

    for ax, (op, subtitle) in zip(axes, DIST_OPS):
        plotted_any = False
        for variant in ("ref", "avx2"):
            arr = raw.get((alg, variant, op))
            if arr is None:
                continue
            plotted_any = True
            s = variant_style[variant]

            # Histogram on log scale: bins span whole decades around the data.
            log_arr = np.log10(arr)
            lo, hi = np.floor(log_arr.min()), np.ceil(log_arr.max())
            bins = np.logspace(lo, hi, 60)
            ax.hist(arr, bins=bins, density=True, color=s["color"],
                    alpha=s["alpha"], zorder=s["zorder"], label=s["label"])

            # KDE on log scale, back-transformed.
            kde = gaussian_kde(log_arr, bw_method=0.25)
            xs_log = np.linspace(lo, hi, 400)
            xs = 10 ** xs_log
            # KDE is in log space; convert density: p(x) = p(log x) / (x ln10)
            ys = kde(xs_log) / (xs * np.log(10))
            ax.plot(xs, ys, color=s["color"], linewidth=1.8, zorder=s["zorder"] + 1)

            # Median line
            med = float(np.median(arr))
            ax.axvline(med, color=s["color"], linewidth=1.2, linestyle="--", zorder=5)

        if not plotted_any:
            ax.set_visible(False)
            continue

        ax.set_xscale("log")
        ax.set_xlabel("Cycles (log scale)")
        # Only the first panel carries the y-axis label.
        ax.set_ylabel("Density" if op == DIST_OPS[0][0] else "")
        ax.set_title(subtitle, fontsize=10)
        ax.legend(fontsize=9)
        ax.xaxis.set_major_formatter(ticker.LogFormatterSciNotation(labelOnlyBase=False))
        ax.grid(axis="x", which="both", linestyle="--", linewidth=0.4, alpha=0.4)
        ax.set_axisbelow(True)

    fig.suptitle(
        f"Cycle Count Distributions — ref vs. avx2 ({ALG_TITLES[alg]})\n"
        "Dashed lines show medians. Distributions are right-skewed → nonparametric statistics.",
        fontsize=10,
    )
    fig.tight_layout()
    _save(fig, out_dir, "distributions")
# ---------------------------------------------------------------------------

def _speedup_bar_panel(
    results: list[dict],
    ops: list[str],
    labels: list[str],
    figsize: tuple[float, float],
    tick_fontsize: int,
    title: str,
    out_dir: Path,
    stem: str,
) -> None:
    """Shared renderer for grouped ref → avx2 speedup bar charts.

    Draws one bar group per operation and one bar per algorithm (offset
    within the group), with asymmetric 95% bootstrap-CI error bars.
    Operations lacking a ref_to_avx2 comparison are masked out rather than
    drawn as zero-height bars.  Extracted because fig_cross_param and
    fig_kem_level were near-verbatim duplicates.
    """
    x = np.arange(len(ops))
    bar_w = 0.22
    offsets = np.array([-bar_w, 0, bar_w])
    colors = ["#4C72B0", "#55A868", "#C44E52"]

    fig, ax = plt.subplots(figsize=figsize)

    for alg, offset, color in zip(ALGORITHMS, offsets, colors):
        vals, ci_lo, ci_hi = [], [], []
        for op in ops:
            match = [r for r in results if r["algorithm"] == alg and r["operation"] == op]
            if match and "ref_to_avx2" in match[0]["comparisons"]:
                c = match[0]["comparisons"]["ref_to_avx2"]
                vals.append(c["speedup"])
                ci_lo.append(c["ci95"][0])
                ci_hi.append(c["ci95"][1])
            else:
                # Placeholder values; masked out below so nothing is drawn.
                vals.append(0)
                ci_lo.append(0)
                ci_hi.append(0)

        vals = np.array(vals)
        ci_lo = np.array(ci_lo)
        ci_hi = np.array(ci_hi)
        # Asymmetric error bars: distance from the point estimate to each bound.
        yerr = np.array([vals - ci_lo, ci_hi - vals])
        mask = vals > 0

        ax.bar(x[mask] + offset, vals[mask], bar_w,
               label=ALG_TITLES[alg], color=color, alpha=0.88, zorder=3)
        ax.errorbar(x[mask] + offset, vals[mask], yerr=yerr[:, mask],
                    fmt="none", ecolor="black", elinewidth=0.8, capsize=3, zorder=4)

    ax.set_xticks(x)
    ax.set_xticklabels(labels, fontsize=tick_fontsize)
    ax.set_ylabel("Speedup ref → avx2 (×)")
    ax.set_title(title)
    ax.yaxis.set_minor_locator(ticker.AutoMinorLocator())
    ax.grid(axis="y", linestyle="--", linewidth=0.4, alpha=0.5, zorder=0)
    ax.set_axisbelow(True)
    ax.legend(fontsize=10)

    fig.tight_layout()
    _save(fig, out_dir, stem)


def fig_cross_param(results: list[dict], out_dir: Path) -> None:
    """Figure 5: per-polynomial speedup consistency across security parameters.

    Per-polynomial operations work on dimension-256 polynomials regardless
    of k, so speedups should agree across mlkem512/768/1024.
    """
    _speedup_bar_panel(
        results,
        ops=CROSS_PARAM_OPS,
        labels=[OP_SHORT.get(op, op) for op in CROSS_PARAM_OPS],
        figsize=(8, 4),
        tick_fontsize=11,
        title=(
            "Per-Polynomial Operation Speedup Across Security Parameters\n"
            "(polynomial dim = 256 for all; NTT variation attributed to cache-state differences)"
        ),
        out_dir=out_dir,
        stem="cross_param",
    )


# ---------------------------------------------------------------------------
# Figure S1: KEM-level end-to-end speedup (supplementary)
# ---------------------------------------------------------------------------

def fig_kem_level(results: list[dict], out_dir: Path) -> None:
    """Figure S1: end-to-end KEM operation speedup (supplementary)."""
    _speedup_bar_panel(
        results,
        ops=KEM_OPS,
        labels=[KEM_SHORT[op] for op in KEM_OPS],
        figsize=(7, 4),
        tick_fontsize=12,
        title=(
            "End-to-End KEM Speedup (ref → avx2)\n"
            "(Intel Xeon Platinum 8268, 95% bootstrap CI)"
        ),
        out_dir=out_dir,
        stem="kem_level",
    )


# ---------------------------------------------------------------------------
# Shared save helper
# ---------------------------------------------------------------------------

def _save(fig: plt.Figure, out_dir: Path, stem: str) -> None:
    """Write the figure as both PDF and 150-dpi PNG, then close it."""
    fig.savefig(out_dir / f"{stem}.pdf", bbox_inches="tight")
    fig.savefig(out_dir / f"{stem}.png", bbox_inches="tight", dpi=150)
    print(f"Saved {out_dir}/{stem}.{{pdf,png}}")
    plt.close(fig)
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

def main() -> None:
    """Parse CLI flags and render every figure into the output directory."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--json", default="analysis/results.json",
                    help="analyzed results JSON (from analyze.py)")
    ap.add_argument("--raw-json", default=None,
                    help="raw aggregator JSON (from aggregate --raw); required for --distributions")
    ap.add_argument("--out", default="analysis/figures")
    opts = ap.parse_args()

    dest = Path(opts.out)
    dest.mkdir(parents=True, exist_ok=True)

    results = load(opts.json)
    print(f"Loaded {len(results)} result rows.")

    # Main figures, rendered in paper order.
    for render in (
        fig_decomposition,
        fig_hand_simd,
        fig_cliffs_heatmap,
        fig_cross_param,
        fig_kem_level,
    ):
        render(results, dest)

    # The distributions figure needs the raw per-observation data.
    if opts.raw_json:
        raw_records = load(opts.raw_json)
        print(f"Loaded {len(raw_records)} raw groups for distributions.")
        fig_distributions(raw_records, dest)
    else:
        print("Skipping distributions figure (pass --raw-json to enable).")


if __name__ == "__main__":
    main()
b/analysis/figures/cross_param.png differ diff --git a/analysis/figures/decomposition.pdf b/analysis/figures/decomposition.pdf new file mode 100644 index 0000000..5f13a71 Binary files /dev/null and b/analysis/figures/decomposition.pdf differ diff --git a/analysis/figures/decomposition.png b/analysis/figures/decomposition.png new file mode 100644 index 0000000..cf9a743 Binary files /dev/null and b/analysis/figures/decomposition.png differ diff --git a/analysis/figures/distributions.pdf b/analysis/figures/distributions.pdf new file mode 100644 index 0000000..297adc1 Binary files /dev/null and b/analysis/figures/distributions.pdf differ diff --git a/analysis/figures/distributions.png b/analysis/figures/distributions.png new file mode 100644 index 0000000..a613ad9 Binary files /dev/null and b/analysis/figures/distributions.png differ diff --git a/analysis/figures/hand_simd_speedup.pdf b/analysis/figures/hand_simd_speedup.pdf new file mode 100644 index 0000000..73e1832 Binary files /dev/null and b/analysis/figures/hand_simd_speedup.pdf differ diff --git a/analysis/figures/hand_simd_speedup.png b/analysis/figures/hand_simd_speedup.png new file mode 100644 index 0000000..204d8d2 Binary files /dev/null and b/analysis/figures/hand_simd_speedup.png differ diff --git a/analysis/figures/kem_level.pdf b/analysis/figures/kem_level.pdf new file mode 100644 index 0000000..49d0999 Binary files /dev/null and b/analysis/figures/kem_level.pdf differ diff --git a/analysis/figures/kem_level.png b/analysis/figures/kem_level.png new file mode 100644 index 0000000..3be4ead Binary files /dev/null and b/analysis/figures/kem_level.png differ diff --git a/analysis/go.mod b/analysis/go.mod new file mode 100644 index 0000000..57fa4a1 --- /dev/null +++ b/analysis/go.mod @@ -0,0 +1,3 @@ +module git.levineuwirth.org/neuwirth/where-simd-helps/analysis + +go 1.26.1 diff --git a/analysis/pkg/parse/parse.go b/analysis/pkg/parse/parse.go new file mode 100644 index 0000000..8b6960e --- /dev/null +++ 
// Package parse reads pqc-bench .out files produced by the SLURM harness.
//
// Each file contains a SLURM prolog header followed by 1–N "loop spin" blocks.
// Each spin block reports one median+average pair per benchmarked operation.
package parse

import (
	"bufio"
	"fmt"
	"os"
	"strconv"
	"strings"
)

// Meta holds the SLURM prolog metadata extracted from the file header.
type Meta struct {
	JobID     string
	JobName   string
	Node      string
	StartedAt string
	Directory string
	// Explicit fields emitted by submit.sh for reliable downstream parsing.
	BenchVariant string
	BenchParam   string
	BenchNSpins  string
}

// Measurement is a single operation's reported statistics for one loop spin.
type Measurement struct {
	Median  int64
	Average int64
}

// Run holds everything parsed from one .out file.
type Run struct {
	File string
	Meta Meta
	// Spins[i] maps operation name → measurement for loop spin i+1.
	Spins []map[string]Measurement
}

// ParseFile reads a single .out file and returns a Run.
//
// It is a line-oriented state machine:
//   - "##"-prefixed prolog lines are handed to parsePrologLine;
//   - "Loop spin:" starts a new spin (flushing any in-progress one);
//   - inside a spin, a line ending in ":" (other than "median:"/"average:")
//     names the current operation;
//   - "median:" is buffered in pendingMedian, and the following "average:"
//     line completes the (Median, Average) pair for the current operation.
//
// Returns an error wrapping the file path on any read or number-parse failure.
func ParseFile(path string) (*Run, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	run := &Run{File: path}
	scanner := bufio.NewScanner(f)
	// Default buffer size is 64KB; lines are short so this is fine.

	var currentSpin map[string]Measurement // op → measurement for the spin being built
	var currentOp string                   // op name awaiting its median/average lines
	var pendingMedian int64                // median seen, waiting for matching average
	inSpin := false                        // true once the first "Loop spin:" marker is seen

	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())

		// SLURM prolog lines start with ##
		if strings.HasPrefix(line, "##") {
			parsePrologLine(line, &run.Meta)
			continue
		}

		// New loop spin
		if strings.HasPrefix(line, "Loop spin:") {
			// Flush the previous spin before starting a fresh map.
			if inSpin && currentSpin != nil {
				run.Spins = append(run.Spins, currentSpin)
			}
			currentSpin = make(map[string]Measurement)
			currentOp = ""
			inSpin = true
			continue
		}

		// Ignore everything before the first spin marker.
		if !inSpin {
			continue
		}

		// Operation name line ends with ':'
		// (handled after the "Loop spin:" check above, which also ends with ':').
		if strings.HasSuffix(line, ":") && !strings.HasPrefix(line, "median") && !strings.HasPrefix(line, "average") {
			currentOp = strings.TrimSuffix(line, ":")
			currentOp = strings.TrimSpace(currentOp)
			continue
		}

		// median/average lines are only meaningful under a named operation.
		if currentOp == "" {
			continue
		}

		if strings.HasPrefix(line, "median:") {
			v, err := parseCycles(line)
			if err != nil {
				return nil, fmt.Errorf("%s: %w", path, err)
			}
			// Buffer until the paired "average:" line arrives.
			pendingMedian = v
			continue
		}

		if strings.HasPrefix(line, "average:") {
			avg, err := parseCycles(line)
			if err != nil {
				return nil, fmt.Errorf("%s: %w", path, err)
			}
			// "average:" completes the pair; reset state for the next op.
			currentSpin[currentOp] = Measurement{Median: pendingMedian, Average: avg}
			currentOp = ""
			pendingMedian = 0
			continue
		}
	}

	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("%s: %w", path, err)
	}

	// Flush last spin
	if inSpin && currentSpin != nil {
		run.Spins = append(run.Spins, currentSpin)
	}

	return run, nil
}