This commit is contained in:
Levi Neuwirth 2026-04-05 11:23:21 -04:00
parent 7750ae3d8c
commit 00ced380f9
92 changed files with 2681844 additions and 260 deletions

3
.gitmodules vendored Normal file
View File

@ -0,0 +1,3 @@
[submodule "algorithms/kyber"]
path = algorithms/kyber
url = https://github.com/pq-crystals/kyber

1
algorithms/kyber Submodule

@ -0,0 +1 @@
Subproject commit 4768bd37c02f9c40a46cb49d4d1f4d5e612bb882

286
analysis/analyze.py Normal file
View File

@ -0,0 +1,286 @@
#!/usr/bin/env python3
"""Statistical analysis of pqc-bench results.
Parses .out files via the Go aggregator, then computes a three-way
decomposition of where speedup originates:
refo0 refnv compiler optimisation (O3, no vectorisation)
refnv ref compiler auto-vectorisation
ref avx2 hand-written SIMD
Usage:
# Run aggregator inline:
python3 analysis/analyze.py --data data/raw/kyber
# Or pre-generate the raw JSON once, then reuse it:
go run ./analysis/cmd/aggregate --raw --out /tmp/bench.json data/raw/kyber
python3 analysis/analyze.py --json /tmp/bench.json
# Write JSON output for figure generation:
python3 analysis/analyze.py --data data/raw/kyber --out analysis/results.json
"""
import argparse
import json
import subprocess
import sys
from pathlib import Path
import numpy as np
from scipy import stats as scipy_stats
# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------
# Repository root, resolved from this file's location (analysis/analyze.py →
# parent of the analysis/ directory). Used as cwd for the Go aggregator.
REPO_ROOT = Path(__file__).resolve().parent.parent
def load_json(path: str) -> list[dict]:
    """Read a pre-generated aggregate JSON file and return its records."""
    with open(path) as fh:
        return json.load(fh)
def run_aggregator(data_dir: str) -> list[dict]:
    """Invoke the Go aggregator on *data_dir* and return its parsed records.

    Prints the aggregator's stderr and exits with status 1 on failure.
    """
    proc = subprocess.run(
        ["go", "run", "./cmd/aggregate", "--raw", data_dir],
        capture_output=True,
        text=True,
        cwd=REPO_ROOT / "analysis",
    )
    if proc.returncode != 0:
        print(f"aggregator failed:\n{proc.stderr}", file=sys.stderr)
        sys.exit(1)
    return json.loads(proc.stdout)
# ---------------------------------------------------------------------------
# Statistics
# ---------------------------------------------------------------------------
def cliffs_delta_from_u(u: float, m: int, n: int) -> float:
    """Cliff's delta derived from a Mann-Whitney U statistic.

    Here U counts pairs (faster_i, baseline_j) with faster_i < baseline_j.
    delta = (2U - m*n) / (m*n), in [-1, +1]; positive means the *faster*
    sample dominates the baseline.
    """
    pairs = m * n
    return (2.0 * u - pairs) / pairs
def bootstrap_speedup_ci(
baseline: np.ndarray,
faster: np.ndarray,
n_boot: int = 5_000,
ci: float = 0.95,
rng: np.random.Generator | None = None,
) -> tuple[float, float]:
"""95% bootstrap CI for speedup = median(baseline) / median(faster).
Resamples both arrays independently using vectorised indexing; returns (lo, hi).
"""
if rng is None:
rng = np.random.default_rng(42)
m, n = len(baseline), len(faster)
# Draw all indices at once: shape (n_boot, m) and (n_boot, n)
bi = rng.integers(0, m, size=(n_boot, m))
fi = rng.integers(0, n, size=(n_boot, n))
b_samples = baseline[bi] # (n_boot, m)
f_samples = faster[fi] # (n_boot, n)
# Median along axis=1 for each bootstrap replicate
ratios = np.median(b_samples, axis=1) / np.median(f_samples, axis=1)
alpha = (1 - ci) / 2
return float(np.percentile(ratios, alpha * 100)), float(np.percentile(ratios, (1 - alpha) * 100))
def compare(baseline: np.ndarray, faster: np.ndarray, rng: np.random.Generator) -> dict:
    """Full pairwise comparison: speedup + CI + Mann-Whitney + Cliff's delta.

    Returns a dict with the median speedup, its bootstrap CI, the one-sided
    Mann-Whitney p-value (H1: faster < baseline), and Cliff's delta oriented
    so that positive values mean *faster* dominates *baseline*.
    """
    speedup = float(np.median(baseline)) / float(np.median(faster))
    ci_lo, ci_hi = bootstrap_speedup_ci(baseline, faster, rng=rng)
    # One-sided Mann-Whitney: is faster < baseline in cycle counts?
    m, n = len(faster), len(baseline)
    u_stat, p_val = scipy_stats.mannwhitneyu(faster, baseline, alternative="less")
    # BUG FIX: scipy returns U1 = #pairs where faster_i > baseline_j (ties
    # count 0.5), but cliffs_delta_from_u expects #pairs where
    # faster_i < baseline_j. Convert via U' = m*n - U1 so a dominant faster
    # sample yields delta = +1 (matching the figure/heatmap convention),
    # not -1 as the raw U1 would.
    delta = cliffs_delta_from_u(m * n - float(u_stat), m, n)
    return {
        "speedup": speedup,
        "ci95": [ci_lo, ci_hi],
        "mannwhitney_p": float(p_val),
        "cliffs_delta": delta,
        "n_baseline": n,
        "n_faster": m,
    }
# ---------------------------------------------------------------------------
# Analysis
# ---------------------------------------------------------------------------
# Build variants ordered least → most optimised (see module docstring):
#   refo0  baseline (-O0 per figure labels)
#   refnv  O3 with auto-vectorisation disabled
#   ref    O3 with auto-vectorisation
#   avx2   hand-written SIMD
VARIANTS = ("refo0", "refnv", "ref", "avx2")
# Canonical operation order for display
OP_ORDER = [
    "NTT", "INVNTT", "basemul", "frommsg",
    "gen_a", "poly_getnoise_eta1", "poly_getnoise_eta2",
    "keygen", "enc", "dec",
]
def analyze(records: list[dict]) -> list[dict]:
    """Compute all pairwise variant comparisons per (algorithm, operation).

    Only records carrying per-observation ``raw`` samples are used; groups
    with fewer than two variants present are skipped. A single seeded RNG
    is threaded through every compare() call, so the bootstrap draws — and
    therefore the output — are fully reproducible, but also order-dependent:
    do not reorder the comparison calls below.
    """
    # Build lookup: (algorithm, variant, operation) → raw array
    raw: dict[tuple[str, str, str], np.ndarray] = {}
    for r in records:
        if r.get("raw"):
            raw[(r["algorithm"], r["variant"], r["operation"])] = np.array(r["raw"], dtype=np.float64)
    # Collect all (algorithm, operation) pairs present across all variants
    alg_ops = sorted(
        {(alg, op) for alg, var, op in raw},
        key=lambda x: (x[0], _op_rank(x[1])),
    )
    rng = np.random.default_rng(42)
    results = []
    for alg, op in alg_ops:
        arrays = {v: raw[(alg, v, op)] for v in VARIANTS if (alg, v, op) in raw}
        if len(arrays) < 2:
            continue
        row: dict = {
            "algorithm": alg,
            "operation": op,
            "medians": {v: float(np.median(a)) for v, a in arrays.items()},
            "n_obs": {v: len(a) for v, a in arrays.items()},
            "comparisons": {},
        }
        comps = row["comparisons"]
        # Three-way decomposition (each step requires both variants present)
        if "refo0" in arrays and "refnv" in arrays:
            comps["refo0_to_refnv"] = compare(arrays["refo0"], arrays["refnv"], rng)
        if "refnv" in arrays and "ref" in arrays:
            comps["refnv_to_ref"] = compare(arrays["refnv"], arrays["ref"], rng)
        if "ref" in arrays and "avx2" in arrays:
            comps["ref_to_avx2"] = compare(arrays["ref"], arrays["avx2"], rng)
        # Totals
        if "refo0" in arrays and "ref" in arrays:
            comps["refo0_to_ref"] = compare(arrays["refo0"], arrays["ref"], rng)
        if "refo0" in arrays and "avx2" in arrays:
            comps["refo0_to_avx2"] = compare(arrays["refo0"], arrays["avx2"], rng)
        results.append(row)
    return results
def _op_rank(op: str) -> int:
    """Position of *op* in OP_ORDER; unknown operations sort last."""
    return OP_ORDER.index(op) if op in OP_ORDER else len(OP_ORDER)
# ---------------------------------------------------------------------------
# Display
# ---------------------------------------------------------------------------
def _fmt_speedup(comp: dict | None) -> str:
if comp is None:
return ""
r = comp["speedup"]
lo, hi = comp["ci95"]
return f"{r:5.2f}x [{lo:.2f},{hi:.2f}]"
def _fmt_delta(comp: dict | None) -> str:
if comp is None:
return ""
return f"{comp['cliffs_delta']:+.3f}"
def _fmt_p(comp: dict | None) -> str:
if comp is None:
return ""
p = comp["mannwhitney_p"]
if p < 1e-300:
return " <1e-300"
if p < 1e-10:
return f" {p:.1e}"
return f" {p:.4f}"
def print_table(results: list[dict]) -> None:
    """Print a per-algorithm comparison table to stdout.

    Rows within each algorithm are sorted by descending ref→avx2 speedup;
    the Cliff δ and p-value columns refer to the ref→avx2 comparison.
    """
    algs = sorted({r["algorithm"] for r in results})
    for alg in algs:
        rows = [r for r in results if r["algorithm"] == alg]
        rows.sort(key=lambda r: -r["comparisons"].get("ref_to_avx2", {}).get("speedup", 0))
        # FIX: the previous f"{'' * 110}" multiplied an *empty* string and
        # printed nothing; use the same '─' glyph as the column underlines
        # below so section separators are actually visible.
        print(f"\n{'─' * 110}")
        print(f" {alg.upper()}")
        print(f"{'─' * 110}")
        print(
            f" {'Operation':<24}"
            f" {'O3 (no-vec)':>18}"  # refo0→refnv
            f" {'Auto-vec':>18}"  # refnv→ref
            f" {'Hand SIMD':>18}"  # ref→avx2
            f" {'Total':>18}"  # refo0→avx2
            f" {'Cliff δ':>7}"
            f" {'p-value':>9}"
        )
        print(f" {'':─<24} {'':─<18} {'':─<18} {'':─<18} {'':─<18} {'':─<7} {'':─<9}")
        for r in rows:
            c = r["comparisons"]
            print(
                f" {r['operation']:<24}"
                f" {_fmt_speedup(c.get('refo0_to_refnv')):>18}"
                f" {_fmt_speedup(c.get('refnv_to_ref')):>18}"
                f" {_fmt_speedup(c.get('ref_to_avx2')):>18}"
                f" {_fmt_speedup(c.get('refo0_to_avx2')):>18}"
                f" {_fmt_delta(c.get('ref_to_avx2')):>7}"
                f" {_fmt_p(c.get('ref_to_avx2')):>9}"
            )
    # Footer legend (printed once, after all algorithms).
    print(f"\n{'─' * 110}")
    print(" Speedup = median(baseline) / median(variant); CI: 95% bootstrap (5000 iterations)")
    print(" Cliff δ and p-value are for ref → avx2 comparison (H1: avx2 cycles < ref cycles)")
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main() -> None:
    """CLI entry point: load records, analyze, print table, optionally dump JSON.

    Exactly one of --data (runs the Go aggregator) or --json (pre-generated
    aggregate output) is required; --out additionally writes the analysis
    results as JSON for figure generation.
    """
    parser = argparse.ArgumentParser(description="Statistical analysis of pqc-bench results")
    src = parser.add_mutually_exclusive_group(required=True)
    src.add_argument("--data", metavar="DIR", help="data directory (runs Go aggregator)")
    src.add_argument("--json", metavar="FILE", help="pre-generated aggregate JSON with --raw")
    parser.add_argument("--out", metavar="FILE", help="write analysis JSON to this file")
    args = parser.parse_args()
    if args.json:
        records = load_json(args.json)
        print(f"Loaded {len(records)} groups from {args.json}.", file=sys.stderr)
    else:
        print("Running aggregator...", file=sys.stderr)
        records = run_aggregator(args.data)
        print(f"Loaded {len(records)} groups.", file=sys.stderr)
    results = analyze(records)
    print_table(results)
    if args.out:
        with open(args.out, "w") as f:
            json.dump(results, f, indent=2)
        print(f"\nWrote analysis JSON to {args.out}", file=sys.stderr)
if __name__ == "__main__":
    main()

View File

@ -0,0 +1,215 @@
// aggregate parses pqc-bench .out files and emits summary statistics as JSON.
//
// Usage:
//
// aggregate [--raw] [--out results.json] <data-dir>
//
// It walks <data-dir> for all *.out files, grouping results by the parent
// directory name (algorithm) and the variant inferred from the SLURM header.
// Output is a JSON array of result objects, one per (algorithm, variant,
// operation) triple.
package main
import (
"encoding/json"
"flag"
"fmt"
"io/fs"
"os"
"path/filepath"
"slices"
"strings"
"git.levineuwirth.org/neuwirth/where-simd-helps/analysis/pkg/parse"
"git.levineuwirth.org/neuwirth/where-simd-helps/analysis/pkg/stats"
)
// Result is one output record: all statistics for a single
// (algorithm, variant, operation) group.
type Result struct {
	Algorithm     string     `json:"algorithm"`
	Variant       string     `json:"variant"`
	Operation     string     `json:"operation"`
	Unit          string     `json:"unit"` // always "cycles" (set at build time below)
	NObservations int        `json:"n_observations"`
	NRuns         int        `json:"n_runs"` // number of source .out files contributing
	Median        float64    `json:"median"`
	Mean          float64    `json:"mean"`
	Std           float64    `json:"std"`
	MAD           float64    `json:"mad"`
	P5            float64    `json:"p5"`
	P25           float64    `json:"p25"`
	P75           float64    `json:"p75"`
	P95           float64    `json:"p95"`
	P99           float64    `json:"p99"`
	CI95          [2]float64 `json:"ci95"`
	Node          string     `json:"node"`
	Sources       []string   `json:"sources"`
	// Raw holds the per-observation values; populated only under --raw.
	Raw []int64 `json:"raw,omitempty"`
}

// groupKey uniquely identifies a (algorithm, variant, operation) combination.
type groupKey struct {
	algorithm, variant, operation string
}
func main() {
	rawFlag := flag.Bool("raw", false, "include per-observation cycle counts in output")
	outFlag := flag.String("out", "", "write JSON output to this file instead of stdout")
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: aggregate [--raw] [--out FILE] <data-dir>\n")
		flag.PrintDefaults()
	}
	flag.Parse()
	if flag.NArg() != 1 {
		flag.Usage()
		os.Exit(1)
	}
	dataDir := flag.Arg(0)
	// Collect all .out files.
	var outFiles []string
	err := filepath.WalkDir(dataDir, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if !d.IsDir() && strings.HasSuffix(path, ".out") {
			outFiles = append(outFiles, path)
		}
		return nil
	})
	if err != nil {
		fmt.Fprintf(os.Stderr, "error walking %s: %v\n", dataDir, err)
		os.Exit(1)
	}
	if len(outFiles) == 0 {
		fmt.Fprintf(os.Stderr, "no .out files found under %s\n", dataDir)
		os.Exit(1)
	}
	// Parse every file and accumulate observations per group.
	// Unparseable files are skipped with a warning rather than aborting.
	type accumulator struct {
		values  []int64
		sources []string
		node    string // node of the first file seen for this group
	}
	groups := make(map[groupKey]*accumulator)
	for _, path := range outFiles {
		run, err := parse.ParseFile(path)
		if err != nil {
			fmt.Fprintf(os.Stderr, "warning: skipping %s: %v\n", path, err)
			continue
		}
		algorithm := inferAlgorithm(run.Meta, path)
		variant := parse.InferVariant(run.Meta)
		for _, spin := range run.Spins {
			for op, m := range spin {
				key := groupKey{algorithm, variant, op}
				acc := groups[key]
				if acc == nil {
					acc = &accumulator{node: run.Meta.Node}
					groups[key] = acc
				}
				// One observation per spin: the spin's median for this op.
				acc.values = append(acc.values, m.Median)
			}
		}
		// Record sources per group (any key with this algorithm+variant).
		// NOTE: scans all groups once per file — O(files × groups), fine at
		// this scale.
		for key, acc := range groups {
			if key.algorithm == algorithm && key.variant == variant {
				if !slices.Contains(acc.sources, path) {
					acc.sources = append(acc.sources, path)
				}
			}
		}
	}
	// Build results. Compute stats on a sorted copy so acc.values keeps its
	// original observation order for the --raw output.
	results := make([]Result, 0, len(groups))
	for key, acc := range groups {
		sorted := make([]int64, len(acc.values))
		copy(sorted, acc.values)
		stats.SortInt64(sorted)
		s := stats.Compute(sorted)
		r := Result{
			Algorithm:     key.algorithm,
			Variant:       key.variant,
			Operation:     key.operation,
			Unit:          "cycles",
			NObservations: s.N,
			NRuns:         len(acc.sources),
			Median:        s.Median,
			Mean:          s.Mean,
			Std:           s.Std,
			MAD:           s.MAD,
			P5:            s.P5,
			P25:           s.P25,
			P75:           s.P75,
			P95:           s.P95,
			P99:           s.P99,
			CI95:          s.CI95,
			Node:          acc.node,
			Sources:       acc.sources,
		}
		if *rawFlag {
			r.Raw = acc.values
		}
		results = append(results, r)
	}
	// Sort for stable output: algorithm → variant → operation.
	slices.SortFunc(results, func(a, b Result) int {
		if a.Algorithm != b.Algorithm {
			return strings.Compare(a.Algorithm, b.Algorithm)
		}
		if a.Variant != b.Variant {
			return strings.Compare(a.Variant, b.Variant)
		}
		return strings.Compare(a.Operation, b.Operation)
	})
	out, err := json.MarshalIndent(results, "", "  ")
	if err != nil {
		fmt.Fprintf(os.Stderr, "error marshalling JSON: %v\n", err)
		os.Exit(1)
	}
	if *outFlag != "" {
		if err := os.WriteFile(*outFlag, out, 0o644); err != nil {
			fmt.Fprintf(os.Stderr, "error writing %s: %v\n", *outFlag, err)
			os.Exit(1)
		}
		fmt.Fprintf(os.Stderr, "wrote %d results to %s\n", len(results), *outFlag)
	} else {
		fmt.Println(string(out))
	}
}
// inferAlgorithm returns the algorithm name (e.g. "mlkem512") for a run.
//
// Priority:
//  1. BENCH_PARAM metadata → "mlkem{PARAM}" (new-style runs via submit.sh)
//  2. Walk the file path upward for a segment matching "mlkem\d+" (handles
//     both flat old-style layout and new nested layout transparently)
//  3. The immediate parent directory name as a last resort.
func inferAlgorithm(meta parse.Meta, filePath string) string {
	if meta.BenchParam != "" {
		return "mlkem" + meta.BenchParam
	}
	// Walk path components looking for mlkem\d+. The previous prefix-only
	// check also matched segments like "mlkem-notes"; require a digits-only
	// suffix to honour the documented pattern.
	dir := filepath.Dir(filePath)
	for dir != "." && dir != "/" {
		base := filepath.Base(dir)
		if isMlkemSegment(base) {
			return base
		}
		dir = filepath.Dir(dir)
	}
	return filepath.Base(filepath.Dir(filePath))
}

// isMlkemSegment reports whether name matches mlkem\d+ (e.g. "mlkem768").
func isMlkemSegment(name string) bool {
	rest, ok := strings.CutPrefix(name, "mlkem")
	if !ok || rest == "" {
		return false
	}
	for _, r := range rest {
		if r < '0' || r > '9' {
			return false
		}
	}
	return true
}

View File

@ -0,0 +1,242 @@
// analyze-simd computes speedup ratios from aggregated pqc-bench results.
//
// Usage:
//
// analyze-simd [--baseline ref] [--in results.json] [--out speedups.json]
//
// It reads the JSON produced by 'aggregate', computes per-operation speedups
// relative to the baseline variant, and emits both a human-readable table
// and a structured JSON file suitable for downstream plotting.
package main
import (
"cmp"
"encoding/json"
"flag"
"fmt"
"math"
"os"
"slices"
"strings"
"text/tabwriter"
)
// Record mirrors the aggregate output schema (fields we need).
type Record struct {
	Algorithm string     `json:"algorithm"`
	Variant   string     `json:"variant"`
	Operation string     `json:"operation"`
	Median    float64    `json:"median"`
	CI95      [2]float64 `json:"ci95"`
	NRuns     int        `json:"n_runs"`
}

// Speedup is one variant-vs-baseline comparison for a single (algorithm, operation).
type Speedup struct {
	Variant string  `json:"variant"`
	Median  float64 `json:"median"`
	// Speedup = baseline median / variant median (>1 means variant faster).
	Speedup   float64    `json:"speedup"`
	SpeedupCI [2]float64 `json:"speedup_ci95"`
}

// Result is one output row: all comparisons for one (algorithm, operation) pair.
type Result struct {
	Algorithm       string     `json:"algorithm"`
	Operation       string     `json:"operation"`
	BaselineVariant string     `json:"baseline_variant"`
	BaselineMedian  float64    `json:"baseline_median"`
	BaselineCI95    [2]float64 `json:"baseline_ci95"`
	Comparisons     []Speedup  `json:"comparisons"`
}
func main() {
	baseline := flag.String("baseline", "ref", "variant to use as the speedup denominator")
	inFile := flag.String("in", "results/kyber.json", "input JSON from aggregate")
	outFile := flag.String("out", "", "write speedup JSON to this file (default: stdout)")
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: analyze-simd [--baseline VARIANT] [--in FILE] [--out FILE]\n")
		flag.PrintDefaults()
	}
	flag.Parse()
	raw, err := os.ReadFile(*inFile)
	if err != nil {
		fmt.Fprintf(os.Stderr, "error reading %s: %v\n", *inFile, err)
		os.Exit(1)
	}
	var records []Record
	if err := json.Unmarshal(raw, &records); err != nil {
		fmt.Fprintf(os.Stderr, "error parsing JSON: %v\n", err)
		os.Exit(1)
	}
	// Index by (algorithm, variant, operation).
	type key struct{ algorithm, variant, operation string }
	idx := make(map[key]Record, len(records))
	for _, r := range records {
		idx[key{r.Algorithm, r.Variant, r.Operation}] = r
	}
	// Collect sorted unique values for stable output.
	algorithms := unique(records, func(r Record) string { return r.Algorithm })
	operations := unique(records, func(r Record) string { return r.Operation })
	variants := unique(records, func(r Record) string { return r.Variant })
	// Remove baseline from comparison variants.
	variants = slices.DeleteFunc(variants, func(v string) bool { return v == *baseline })
	// Build results. Pairs missing a baseline record (or with a zero
	// baseline median) are skipped entirely.
	var results []Result
	for _, alg := range algorithms {
		for _, op := range operations {
			baseRec, ok := idx[key{alg, *baseline, op}]
			if !ok || baseRec.Median == 0 {
				continue
			}
			res := Result{
				Algorithm:       alg,
				Operation:       op,
				BaselineVariant: *baseline,
				BaselineMedian:  baseRec.Median,
				BaselineCI95:    baseRec.CI95,
			}
			for _, v := range variants {
				cmpRec, ok := idx[key{alg, v, op}]
				if !ok || cmpRec.Median == 0 {
					continue
				}
				sp := baseRec.Median / cmpRec.Median
				// Conservative CI: ratio of interval bounds.
				//   speedup_lo = baseline_lo / cmp_hi
				//   speedup_hi = baseline_hi / cmp_lo
				var spCI [2]float64
				if cmpRec.CI95[1] > 0 {
					spCI[0] = safeDiv(baseRec.CI95[0], cmpRec.CI95[1])
				}
				if cmpRec.CI95[0] > 0 {
					spCI[1] = safeDiv(baseRec.CI95[1], cmpRec.CI95[0])
				}
				res.Comparisons = append(res.Comparisons, Speedup{
					Variant:   v,
					Median:    cmpRec.Median,
					Speedup:   sp,
					SpeedupCI: spCI,
				})
			}
			if len(res.Comparisons) > 0 {
				results = append(results, res)
			}
		}
	}
	// Print human-readable table to stderr (keeps stdout JSON-clean).
	printTable(os.Stderr, results, variants, *baseline)
	// Emit JSON.
	out, err := json.MarshalIndent(results, "", "  ")
	if err != nil {
		fmt.Fprintf(os.Stderr, "error marshalling JSON: %v\n", err)
		os.Exit(1)
	}
	if *outFile != "" {
		if err := os.WriteFile(*outFile, out, 0o644); err != nil {
			fmt.Fprintf(os.Stderr, "error writing %s: %v\n", *outFile, err)
			os.Exit(1)
		}
		fmt.Fprintf(os.Stderr, "wrote %d results to %s\n", len(results), *outFile)
	} else {
		fmt.Println(string(out))
	}
}
// printTable writes a per-algorithm, tab-aligned speedup table to w.
// Rows sort by descending avx2 speedup (ties alphabetically by operation);
// variants with no comparison for a row render as "---".
func printTable(w *os.File, results []Result, variants []string, baseline string) {
	tw := tabwriter.NewWriter(w, 0, 0, 2, ' ', 0)
	// Group by algorithm.
	byAlg := make(map[string][]Result)
	for _, r := range results {
		byAlg[r.Algorithm] = append(byAlg[r.Algorithm], r)
	}
	algs := make([]string, 0, len(byAlg))
	for a := range byAlg {
		algs = append(algs, a)
	}
	slices.Sort(algs)
	for _, alg := range algs {
		fmt.Fprintf(tw, "\n── %s (baseline: %s) ──\n", strings.ToUpper(alg), baseline)
		// Header.
		var hdr strings.Builder
		fmt.Fprintf(&hdr, "%-38s\t%12s", "operation", baseline+"(cycles)")
		for _, v := range variants {
			fmt.Fprintf(&hdr, "\t%10s", v)
		}
		fmt.Fprintln(tw, hdr.String())
		fmt.Fprintln(tw, strings.Repeat("-", 38+13+11*len(variants)))
		rows := byAlg[alg]
		slices.SortFunc(rows, func(a, b Result) int {
			// Sort by descending avx2 speedup if available, else alphabetically.
			sa := speedupFor(a, "avx2")
			sb := speedupFor(b, "avx2")
			if sa != sb {
				return cmp.Compare(sb, sa) // descending
			}
			return strings.Compare(a.Operation, b.Operation)
		})
		for _, r := range rows {
			var line strings.Builder
			fmt.Fprintf(&line, "%-38s\t%12s", r.Operation, formatCycles(r.BaselineMedian))
			for _, v := range variants {
				sp := speedupFor(r, v)
				if math.IsNaN(sp) {
					fmt.Fprintf(&line, "\t%10s", "---")
				} else {
					fmt.Fprintf(&line, "\t%9.2fx", sp)
				}
			}
			fmt.Fprintln(tw, line.String())
		}
	}
	tw.Flush()
}
// speedupFor returns the speedup recorded for variant in r, or NaN when the
// variant has no comparison entry.
func speedupFor(r Result, variant string) float64 {
	// Linear scan is fine: Comparisons holds at most one entry per variant.
	for i := range r.Comparisons {
		if r.Comparisons[i].Variant == variant {
			return r.Comparisons[i].Speedup
		}
	}
	return math.NaN()
}
// formatCycles renders a cycle count compactly: "2.50M", "1.5K", or "999".
func formatCycles(c float64) string {
	switch {
	case c >= 1_000_000:
		return fmt.Sprintf("%.2fM", c/1_000_000)
	case c >= 1_000:
		return fmt.Sprintf("%.1fK", c/1_000)
	default:
		return fmt.Sprintf("%.0f", c)
	}
}
// safeDiv returns a/b, or 0 when b is zero (e.g. a zero CI bound).
func safeDiv(a, b float64) float64 {
	if b != 0 {
		return a / b
	}
	return 0
}
// unique returns the sorted set of fn(record) values across records.
func unique(records []Record, fn func(Record) string) []string {
	seen := make(map[string]struct{}, len(records))
	vals := make([]string, 0, len(records))
	for _, rec := range records {
		v := fn(rec)
		if _, dup := seen[v]; dup {
			continue
		}
		seen[v] = struct{}{}
		vals = append(vals, v)
	}
	slices.Sort(vals)
	return vals
}

487
analysis/figures.py Normal file
View File

@ -0,0 +1,487 @@
#!/usr/bin/env python3
"""Matplotlib draft figures for the PQC SIMD speedup analysis.
Usage:
python3 analysis/figures.py [--json analysis/results.json] [--out figures/]
"""
import argparse
import json
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# Cumulative stages used in Figure 1 (each shows total speedup from refo0).
# Keys index into the "comparisons" dict emitted by analyze.py.
STAGE_KEYS = ["refo0_to_refnv", "refo0_to_ref", "refo0_to_avx2"]
STAGE_LABELS = ["O3, no auto-vec", "O3 + auto-vec", "O3 + hand SIMD (avx2)"]
STAGE_COLORS = ["#4C72B0", "#55A868", "#C44E52"]
# Ops to show in the primary figures (excludes top-level KEM wrappers)
PRIMARY_OPS = {
    "poly_frommsg", "INVNTT", "polyvec_basemul_acc_montgomery", "NTT",
    "indcpa_dec", "polyvec_decompress", "poly_decompress",
    "poly_compress", "poly_tomsg", "polyvec_compress",
    "indcpa_enc", "indcpa_keypair", "gen_a",
    "poly_getnoise_eta1", "poly_getnoise_eta2",
}
# Short display names (axis labels only; keys must match operation names)
OP_SHORT = {
    "poly_frommsg": "frommsg",
    "INVNTT": "INVNTT",
    "polyvec_basemul_acc_montgomery": "basemul",
    "NTT": "NTT",
    "indcpa_dec": "dec",
    "polyvec_decompress": "pvec_decomp",
    "poly_decompress": "poly_decomp",
    "poly_compress": "poly_comp",
    "poly_tomsg": "tomsg",
    "polyvec_compress": "pvec_comp",
    "indcpa_enc": "enc",
    "indcpa_keypair": "keypair",
    "gen_a": "gen_a",
    "poly_getnoise_eta1": "noise_η₁",
    "poly_getnoise_eta2": "noise_η₂",
}
ALGORITHMS = ["mlkem512", "mlkem768", "mlkem1024"]
ALG_TITLES = {"mlkem512": "ML-KEM-512", "mlkem768": "ML-KEM-768", "mlkem1024": "ML-KEM-1024"}
# Operations selected to illustrate the distribution figure:
# one high-speedup arithmetic op, one medium SHAKE-bound op, one low-speedup op
DIST_OPS = [
    ("INVNTT", "INVNTT\n(~55× speedup)"),
    ("gen_a", "gen_a\n(~4× speedup)"),
    ("poly_getnoise_eta1", "noise η₁\n(~1.3× speedup)"),
]
# Per-polynomial ops whose speedup should be param-independent
CROSS_PARAM_OPS = [
    "poly_frommsg",
    "INVNTT",
    "polyvec_basemul_acc_montgomery",
    "NTT",
]
# KEM-level ops for supplementary
KEM_OPS = ["kyber_keypair", "kyber_encaps", "kyber_decaps"]
KEM_SHORT = {"kyber_keypair": "KeyGen", "kyber_encaps": "Encaps", "kyber_decaps": "Decaps"}
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def load(json_path: str) -> list[dict]:
    """Parse the analysis-results JSON file produced by analyze.py."""
    with open(json_path) as fh:
        return json.load(fh)
def ops_for_alg(results: list[dict], alg: str) -> list[dict]:
    """Rows for *alg* restricted to PRIMARY_OPS, descending by ref→avx2 speedup."""
    def _avx2_speedup(row: dict) -> float:
        return row["comparisons"].get("ref_to_avx2", {}).get("speedup", 0)

    selected = [
        r for r in results
        if r["algorithm"] == alg and r["operation"] in PRIMARY_OPS
    ]
    return sorted(selected, key=_avx2_speedup, reverse=True)
# ---------------------------------------------------------------------------
# Figure 1: cumulative grouped bars — speedup at each optimisation stage
#
# Each group shows three bars for one operation:
#   refo0→refnv   total speedup with O3, auto-vec OFF
#   refo0→ref     total speedup with O3, auto-vec ON
#   refo0→avx2    total speedup with O3 + hand-written SIMD
#
# Because all bars share the same baseline (refo0=1), they are directly
# comparable without any additive/multiplicative ambiguity.
# ---------------------------------------------------------------------------
def fig_decomposition(results: list[dict], out_dir: Path) -> None:
    """One log-scale grouped-bar panel per algorithm; saved as 'decomposition'."""
    fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharey=False)
    for ax, alg in zip(axes, ALGORITHMS):
        rows = ops_for_alg(results, alg)
        if not rows:
            # No data for this algorithm: hide the panel entirely.
            ax.set_visible(False)
            continue
        ops = [OP_SHORT.get(r["operation"], r["operation"]) for r in rows]
        n = len(rows)
        group = np.arange(n)
        # Three bars per group, evenly spaced within each group slot
        bar_w = 0.22
        offsets = np.array([-bar_w, 0, bar_w])
        for (key, label, color), offset in zip(
            zip(STAGE_KEYS, STAGE_LABELS, STAGE_COLORS), offsets
        ):
            vals = np.array([r["comparisons"].get(key, {}).get("speedup", 0.0) for r in rows])
            ci_lo = np.array([r["comparisons"].get(key, {}).get("ci95", [0.0, 0.0])[0] for r in rows])
            ci_hi = np.array([r["comparisons"].get(key, {}).get("ci95", [0.0, 0.0])[1] for r in rows])
            # Asymmetric error bars: distance from the point estimate to each CI bound.
            yerr = np.array([vals - ci_lo, ci_hi - vals])
            # Missing comparisons default to 0 above; mask them out of the plot.
            mask = vals > 0
            ax.bar(group[mask] + offset, vals[mask], bar_w,
                   label=label, color=color, alpha=0.88, zorder=3)
            ax.errorbar(group[mask] + offset, vals[mask], yerr=yerr[:, mask],
                        fmt="none", ecolor="black", elinewidth=0.7, capsize=2, zorder=4)
        ax.set_yscale("log")
        ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda v, _: f"{v:g}×"))
        ax.set_title(ALG_TITLES[alg], fontsize=12, fontweight="bold")
        ax.set_xticks(group)
        ax.set_xticklabels(ops, rotation=45, ha="right", fontsize=8)
        # Only the leftmost panel carries the shared y-axis label.
        ax.set_ylabel("Speedup over -O0 (×, log scale)" if alg == "mlkem512" else "")
        ax.grid(axis="y", which="both", linestyle="--", linewidth=0.4, alpha=0.5, zorder=0)
        ax.set_axisbelow(True)
        ax.set_xlim(-0.5, n - 0.5)
    # Single shared legend taken from the first panel's handles.
    handles, labels = axes[0].get_legend_handles_labels()
    fig.legend(handles, labels, loc="upper center", ncol=3,
               fontsize=10, frameon=True, bbox_to_anchor=(0.5, 1.02))
    fig.suptitle(
        "ML-KEM Cumulative Speedup at Each Optimisation Stage "
        "(Intel Xeon Platinum 8268, 95% bootstrap CI)",
        fontsize=11, y=1.06,
    )
    fig.tight_layout()
    _save(fig, out_dir, "decomposition")
# ---------------------------------------------------------------------------
# Figure 2: hand-SIMD speedup (ref→avx2), all algorithms overlaid, log scale
# ---------------------------------------------------------------------------
def fig_hand_simd(results: list[dict], out_dir: Path) -> None:
    """Grouped bars of ref→avx2 speedup per op, one bar set per algorithm."""
    # Pivot: operation → {algorithm → ref_to_avx2 comparison dict}.
    all_ops: dict[str, dict] = {}
    for r in results:
        if r["operation"] in PRIMARY_OPS and "ref_to_avx2" in r["comparisons"]:
            all_ops.setdefault(r["operation"], {})
            all_ops[r["operation"]][r["algorithm"]] = r["comparisons"]["ref_to_avx2"]
    # Order operations by descending mlkem512 speedup.
    ops_sorted = sorted(
        all_ops,
        key=lambda op: -all_ops[op].get("mlkem512", {}).get("speedup", 0),
    )
    short_ops = [OP_SHORT.get(op, op) for op in ops_sorted]
    x = np.arange(len(ops_sorted))
    bar_w = 0.25
    offsets = [-bar_w, 0, bar_w]
    colors = ["#4C72B0", "#55A868", "#C44E52"]
    fig, ax = plt.subplots(figsize=(14, 5))
    for alg, offset, color in zip(ALGORITHMS, offsets, colors):
        vals = np.array([all_ops[op].get(alg, {}).get("speedup", 0) for op in ops_sorted])
        ci_lo = np.array([all_ops[op].get(alg, {}).get("ci95", [0, 0])[0] for op in ops_sorted])
        ci_hi = np.array([all_ops[op].get(alg, {}).get("ci95", [0, 0])[1] for op in ops_sorted])
        yerr = np.array([vals - ci_lo, ci_hi - vals])
        # Missing (algorithm, op) combinations defaulted to 0; skip them.
        mask = vals > 0
        ax.bar(x[mask] + offset, vals[mask], bar_w,
               label=ALG_TITLES[alg], color=color, alpha=0.85, zorder=3)
        ax.errorbar(x[mask] + offset, vals[mask], yerr=yerr[:, mask],
                    fmt="none", ecolor="black", elinewidth=0.7, capsize=2, zorder=4)
    ax.set_yscale("log")
    ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda v, _: f"{v:g}×"))
    ax.set_xticks(x)
    ax.set_xticklabels(short_ops, rotation=45, ha="right", fontsize=9)
    ax.set_ylabel("Speedup ref → avx2 (×, log scale)")
    ax.set_title(
        "Hand-Written SIMD Speedup over Compiler-Optimised C\n"
        "(Intel Xeon Platinum 8268, 95% bootstrap CI, n≥2000 per group)"
    )
    ax.grid(axis="y", which="both", linestyle="--", linewidth=0.4, alpha=0.5, zorder=0)
    ax.set_axisbelow(True)
    ax.legend(fontsize=10)
    fig.tight_layout()
    _save(fig, out_dir, "hand_simd_speedup")
# ---------------------------------------------------------------------------
# Figure 3: Cliff's delta heatmap (ref→avx2)
# ---------------------------------------------------------------------------
def fig_cliffs_heatmap(results: list[dict], out_dir: Path) -> None:
    """Algorithms × operations heatmap of Cliff's δ with annotated cells."""
    # Operations sorted by the maximum delta seen across algorithms (descending).
    ops_set = sorted(
        {r["operation"] for r in results if "ref_to_avx2" in r["comparisons"]},
        key=lambda op: -max(
            r["comparisons"]["ref_to_avx2"]["cliffs_delta"]
            for r in results
            if r["operation"] == op and "ref_to_avx2" in r["comparisons"]
        ),
    )
    short_ops = [OP_SHORT.get(op, op) for op in ops_set]
    # NaN cells render blank for missing (algorithm, op) combinations.
    data = np.full((len(ALGORITHMS), len(ops_set)), np.nan)
    for i, alg in enumerate(ALGORITHMS):
        for j, op in enumerate(ops_set):
            match = [r for r in results if r["algorithm"] == alg and r["operation"] == op]
            if match and "ref_to_avx2" in match[0]["comparisons"]:
                data[i, j] = match[0]["comparisons"]["ref_to_avx2"]["cliffs_delta"]
    n_ops = len(ops_set)
    # Width scales with the number of operations so labels stay readable.
    fig, ax = plt.subplots(figsize=(max(10, n_ops * 0.85), 3.2))
    im = ax.imshow(data, aspect="auto", cmap="RdYlGn", vmin=-1, vmax=1)
    plt.colorbar(im, ax=ax, label="Cliff's δ", fraction=0.03, pad=0.02)
    ax.set_yticks(range(len(ALGORITHMS)))
    ax.set_yticklabels([ALG_TITLES[a] for a in ALGORITHMS], fontsize=10)
    ax.set_xticks(range(n_ops))
    ax.set_xticklabels(short_ops, rotation=45, ha="right", fontsize=9)
    ax.set_title(
        "Cliff's δ (ref vs. avx2) δ = +1.00: avx2 strictly faster in every observation pair",
        fontsize=10,
    )
    for i in range(len(ALGORITHMS)):
        for j in range(n_ops):
            if not np.isnan(data[i, j]):
                # White text on dark green cells, black elsewhere
                text_color = "white" if data[i, j] > 0.85 else "black"
                ax.text(j, i, f"{data[i, j]:+.3f}", ha="center", va="center",
                        fontsize=9, color=text_color, fontweight="bold")
    fig.tight_layout()
    _save(fig, out_dir, "cliffs_delta_heatmap")
# ---------------------------------------------------------------------------
# Figure 4: cycle distribution overlays (requires raw aggregator JSON)
#
# Three panels: one high-speedup op, one medium, one low.
# Each panel overlays ref and avx2 histograms + KDE for mlkem512.
# Log x-axis exposes the scale difference honestly.
# ---------------------------------------------------------------------------
def fig_distributions(raw_records: list[dict], out_dir: Path, alg: str = "mlkem512") -> None:
    """Histogram + log-space KDE overlays of ref vs. avx2 cycle counts.

    *raw_records* must come from the aggregator run with --raw so each
    record carries per-observation samples.
    """
    from scipy.stats import gaussian_kde
    # Build lookup: (alg, variant, op) → raw array
    raw: dict[tuple, np.ndarray] = {}
    for r in raw_records:
        if r.get("raw"):
            raw[(r["algorithm"], r["variant"], r["operation"])] = np.array(r["raw"], dtype=np.float64)
    n_ops = len(DIST_OPS)
    fig, axes = plt.subplots(1, n_ops, figsize=(5 * n_ops, 4))
    variant_style = {
        "ref": {"color": "#4C72B0", "label": "ref (O3)", "alpha": 0.55, "zorder": 2},
        "avx2": {"color": "#C44E52", "label": "avx2", "alpha": 0.65, "zorder": 3},
    }
    for ax, (op, subtitle) in zip(axes, DIST_OPS):
        plotted_any = False
        for variant in ("ref", "avx2"):
            arr = raw.get((alg, variant, op))
            if arr is None:
                continue
            plotted_any = True
            s = variant_style[variant]
            # Histogram on log scale
            log_arr = np.log10(arr)
            lo, hi = np.floor(log_arr.min()), np.ceil(log_arr.max())
            bins = np.logspace(lo, hi, 60)
            ax.hist(arr, bins=bins, density=True, color=s["color"],
                    alpha=s["alpha"], zorder=s["zorder"], label=s["label"])
            # KDE on log scale, back-transformed
            kde = gaussian_kde(log_arr, bw_method=0.25)
            xs_log = np.linspace(lo, hi, 400)
            xs = 10 ** xs_log
            # KDE is in log space; convert density: p(x) = p(log x) / (x ln10)
            ys = kde(xs_log) / (xs * np.log(10))
            ax.plot(xs, ys, color=s["color"], linewidth=1.8, zorder=s["zorder"] + 1)
            # Median line
            med = float(np.median(arr))
            ax.axvline(med, color=s["color"], linewidth=1.2, linestyle="--", zorder=5)
        if not plotted_any:
            # Neither variant had raw data for this op: hide the panel.
            ax.set_visible(False)
            continue
        ax.set_xscale("log")
        ax.set_xlabel("Cycles (log scale)")
        # Only the first panel carries the y-axis label.
        ax.set_ylabel("Density" if op == DIST_OPS[0][0] else "")
        ax.set_title(subtitle, fontsize=10)
        ax.legend(fontsize=9)
        ax.xaxis.set_major_formatter(ticker.LogFormatterSciNotation(labelOnlyBase=False))
        ax.grid(axis="x", which="both", linestyle="--", linewidth=0.4, alpha=0.4)
        ax.set_axisbelow(True)
    fig.suptitle(
        f"Cycle Count Distributions — ref vs. avx2 ({ALG_TITLES[alg]})\n"
        "Dashed lines show medians. Distributions are right-skewed → nonparametric statistics.",
        fontsize=10,
    )
    fig.tight_layout()
    _save(fig, out_dir, "distributions")
# ---------------------------------------------------------------------------
# Figure 5: cross-param speedup consistency
#
# For per-polynomial operations the polynomial dimension is always 256,
# independent of the security parameter k. Speedups should be identical
# across mlkem512/768/1024. This figure verifies that.
# ---------------------------------------------------------------------------
def fig_cross_param(results: list[dict], out_dir: Path) -> None:
    """Plot ref→avx2 speedup for per-polynomial ops across security parameters.

    Per-polynomial operations always act on dimension-256 polynomials, so the
    speedup should be invariant across mlkem512/768/1024; this figure makes
    that visible. Error bars are the 95% bootstrap CI from the comparison rows.
    """
    ops = CROSS_PARAM_OPS
    tick_labels = [OP_SHORT.get(op, op) for op in ops]
    positions = np.arange(len(ops))
    width = 0.22
    shifts = np.array([-width, 0.0, width])
    palette = ["#4C72B0", "#55A868", "#C44E52"]

    fig, ax = plt.subplots(figsize=(8, 4))
    for alg, shift, shade in zip(ALGORITHMS, shifts, palette):
        speedups, lo_bounds, hi_bounds = [], [], []
        for op in ops:
            # First matching record wins, mirroring the original lookup.
            rec = next(
                (r for r in results
                 if r["algorithm"] == alg and r["operation"] == op),
                None,
            )
            if rec is not None and "ref_to_avx2" in rec["comparisons"]:
                comp = rec["comparisons"]["ref_to_avx2"]
                speedups.append(comp["speedup"])
                lo_bounds.append(comp["ci95"][0])
                hi_bounds.append(comp["ci95"][1])
            else:
                # Missing data is encoded as zero and masked out below.
                speedups.append(0)
                lo_bounds.append(0)
                hi_bounds.append(0)
        speedups = np.array(speedups)
        lo_bounds = np.array(lo_bounds)
        hi_bounds = np.array(hi_bounds)
        err = np.array([speedups - lo_bounds, hi_bounds - speedups])
        present = speedups > 0
        ax.bar(positions[present] + shift, speedups[present], width,
               label=ALG_TITLES[alg], color=shade, alpha=0.88, zorder=3)
        ax.errorbar(positions[present] + shift, speedups[present],
                    yerr=err[:, present], fmt="none", ecolor="black",
                    elinewidth=0.8, capsize=3, zorder=4)
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels, fontsize=11)
    ax.set_ylabel("Speedup ref → avx2 (×)")
    ax.set_title(
        "Per-Polynomial Operation Speedup Across Security Parameters\n"
        "(polynomial dim = 256 for all; NTT variation attributed to cache-state differences)"
    )
    ax.yaxis.set_minor_locator(ticker.AutoMinorLocator())
    ax.grid(axis="y", linestyle="--", linewidth=0.4, alpha=0.5, zorder=0)
    ax.set_axisbelow(True)
    ax.legend(fontsize=10)
    fig.tight_layout()
    _save(fig, out_dir, "cross_param")
# ---------------------------------------------------------------------------
# Figure S1: KEM-level end-to-end speedup (supplementary)
# ---------------------------------------------------------------------------
def fig_kem_level(results: list[dict], out_dir: Path) -> None:
    """Supplementary figure: end-to-end KEM speedup (ref → avx2) per parameter set.

    One bar group per security level; missing (algorithm, operation) pairs are
    simply omitted from the plot. Error bars show the 95% bootstrap CI.
    """
    ops = KEM_OPS
    tick_labels = [KEM_SHORT[op] for op in ops]
    xs = np.arange(len(ops))
    width = 0.22
    shifts = (-width, 0.0, width)
    palette = ("#4C72B0", "#55A868", "#C44E52")

    fig, ax = plt.subplots(figsize=(7, 4))
    for alg, shift, color in zip(ALGORITHMS, shifts, palette):
        speedup = np.zeros(len(ops))
        lo = np.zeros(len(ops))
        hi = np.zeros(len(ops))
        for i, op in enumerate(ops):
            # First matching record wins, mirroring the original lookup.
            rec = next(
                (r for r in results
                 if r["algorithm"] == alg and r["operation"] == op),
                None,
            )
            if rec is not None and "ref_to_avx2" in rec["comparisons"]:
                comp = rec["comparisons"]["ref_to_avx2"]
                speedup[i] = comp["speedup"]
                lo[i], hi[i] = comp["ci95"]
        err = np.array([speedup - lo, hi - speedup])
        keep = speedup > 0
        ax.bar(xs[keep] + shift, speedup[keep], width,
               label=ALG_TITLES[alg], color=color, alpha=0.88, zorder=3)
        ax.errorbar(xs[keep] + shift, speedup[keep], yerr=err[:, keep],
                    fmt="none", ecolor="black", elinewidth=0.8, capsize=3, zorder=4)
    ax.set_xticks(xs)
    ax.set_xticklabels(tick_labels, fontsize=12)
    ax.set_ylabel("Speedup ref → avx2 (×)")
    ax.set_title(
        "End-to-End KEM Speedup (ref → avx2)\n"
        "(Intel Xeon Platinum 8268, 95% bootstrap CI)"
    )
    ax.yaxis.set_minor_locator(ticker.AutoMinorLocator())
    ax.grid(axis="y", linestyle="--", linewidth=0.4, alpha=0.5, zorder=0)
    ax.set_axisbelow(True)
    ax.legend(fontsize=10)
    fig.tight_layout()
    _save(fig, out_dir, "kem_level")
# ---------------------------------------------------------------------------
# Shared save helper
# ---------------------------------------------------------------------------
def _save(fig: plt.Figure, out_dir: Path, stem: str) -> None:
    """Write *fig* as both a PDF and a 150-dpi PNG under *out_dir*, then close it."""
    for ext, extra in (("pdf", {}), ("png", {"dpi": 150})):
        fig.savefig(out_dir / f"{stem}.{ext}", bbox_inches="tight", **extra)
    print(f"Saved {out_dir}/{stem}.{{pdf,png}}")
    plt.close(fig)
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main() -> None:
    """CLI entry point: load analyzed results and emit every figure."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--json", default="analysis/results.json",
                        help="analyzed results JSON (from analyze.py)")
    parser.add_argument("--raw-json", default=None,
                        help="raw aggregator JSON (from aggregate --raw); required for --distributions")
    parser.add_argument("--out", default="analysis/figures")
    args = parser.parse_args()

    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): `load` is assumed to be defined earlier in this module —
    # confirm it exists (the aggregator-side script exposes `load_json`).
    results = load(args.json)
    print(f"Loaded {len(results)} result rows.")

    # Main figures always render; the distributions figure needs raw samples.
    for render in (fig_decomposition, fig_hand_simd, fig_cliffs_heatmap,
                   fig_cross_param, fig_kem_level):
        render(results, out_dir)

    if args.raw_json:
        raw_records = load(args.raw_json)
        print(f"Loaded {len(raw_records)} raw groups for distributions.")
        fig_distributions(raw_records, out_dir)
    else:
        print("Skipping distributions figure (pass --raw-json to enable).")


if __name__ == "__main__":
    main()

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 94 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 58 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 122 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 116 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 102 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 52 KiB

3
analysis/go.mod Normal file
View File

@ -0,0 +1,3 @@
module git.levineuwirth.org/neuwirth/where-simd-helps/analysis
go 1.26.1

189
analysis/pkg/parse/parse.go Normal file
View File

@ -0,0 +1,189 @@
// Package parse reads pqc-bench .out files produced by the SLURM harness.
//
// Each file contains a SLURM prolog header followed by 1–N "loop spin" blocks.
// Each spin block reports one median+average pair per benchmarked operation.
package parse
import (
"bufio"
"fmt"
"os"
"strconv"
"strings"
)
// Meta holds the SLURM prolog metadata extracted from the file header.
type Meta struct {
	JobID     string // "## Job ID" prolog field
	JobName   string // "## Job Name" prolog field
	Node      string // "## Nodelist" prolog field
	StartedAt string // "## Job Started" prolog field (kept as the raw string)
	Directory string // "## Directory" the job ran from
	// Explicit fields emitted by submit.sh for reliable downstream parsing.
	BenchVariant string // "## BENCH_VARIANT" header value (e.g. "ref", "avx2")
	BenchParam   string // "## BENCH_PARAM" header value (security parameter)
	BenchNSpins  string // "## BENCH_NSPINS" header value (loop spin count)
}
// Measurement is a single operation's reported statistics for one loop spin.
type Measurement struct {
	Median  int64 // value of the "median:" line, in cycles/ticks
	Average int64 // value of the "average:" line, in cycles/ticks
}
// Run holds everything parsed from one .out file.
type Run struct {
	File string // path of the parsed .out file
	Meta Meta   // SLURM prolog metadata from the file header
	// Spins[i] maps operation name → measurement for loop spin i+1.
	Spins []map[string]Measurement
}
// ParseFile reads a single .out file and returns a Run.
//
// The parser is a line-oriented state machine: "##" prolog lines feed Meta,
// a "Loop spin:" line opens a fresh spin map, and inside a spin each
// operation appears as a name line ("<op>:") followed by a "median:" and an
// "average:" line. A measurement is committed to the spin map only when its
// "average:" line arrives.
func ParseFile(path string) (*Run, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()
	run := &Run{File: path}
	scanner := bufio.NewScanner(f)
	// Default buffer size is 64KB; lines are short so this is fine.
	var currentSpin map[string]Measurement
	var currentOp string
	var pendingMedian int64 // median seen for currentOp, awaiting its average
	inSpin := false
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		// SLURM prolog lines start with ##
		if strings.HasPrefix(line, "##") {
			parsePrologLine(line, &run.Meta)
			continue
		}
		// New loop spin
		if strings.HasPrefix(line, "Loop spin:") {
			if inSpin && currentSpin != nil {
				// Close out the previous spin before starting the next.
				run.Spins = append(run.Spins, currentSpin)
			}
			currentSpin = make(map[string]Measurement)
			currentOp = ""
			inSpin = true
			continue
		}
		if !inSpin {
			// Ignore everything before the first "Loop spin:" marker.
			continue
		}
		// Operation name line ends with ':'
		// (the median/average prefixes are excluded so their lines are
		// not mistaken for operation names).
		if strings.HasSuffix(line, ":") && !strings.HasPrefix(line, "median") && !strings.HasPrefix(line, "average") {
			currentOp = strings.TrimSuffix(line, ":")
			currentOp = strings.TrimSpace(currentOp)
			continue
		}
		if currentOp == "" {
			// median/average lines with no pending operation are skipped.
			continue
		}
		if strings.HasPrefix(line, "median:") {
			v, err := parseCycles(line)
			if err != nil {
				return nil, fmt.Errorf("%s: %w", path, err)
			}
			pendingMedian = v
			continue
		}
		if strings.HasPrefix(line, "average:") {
			avg, err := parseCycles(line)
			if err != nil {
				return nil, fmt.Errorf("%s: %w", path, err)
			}
			// The average line completes the measurement; commit and reset.
			currentSpin[currentOp] = Measurement{Median: pendingMedian, Average: avg}
			currentOp = ""
			pendingMedian = 0
			continue
		}
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("%s: %w", path, err)
	}
	// Flush last spin
	if inSpin && currentSpin != nil {
		run.Spins = append(run.Spins, currentSpin)
	}
	return run, nil
}
// parseCycles extracts the integer from lines like "median: 25194 cycles/ticks".
func parseCycles(line string) (int64, error) {
	// Expected shape: "<label>: <N> cycles/ticks" — the value is field #2.
	fields := strings.Fields(line)
	if len(fields) >= 2 {
		return strconv.ParseInt(fields[1], 10, 64)
	}
	return 0, fmt.Errorf("unexpected line format: %q", line)
}
// parsePrologLine extracts key/value pairs from SLURM header lines.
func parsePrologLine(line string, meta *Meta) {
	// Lines look like: "## Job ID : 11233228". Pure decoration lines
	// ("####...") contain no ':' and fall out of the Cut below.
	body := strings.TrimSpace(strings.TrimLeft(line, "#"))
	key, val, ok := strings.Cut(body, ":")
	if !ok {
		return
	}
	// Map each recognised header key to its destination field; unknown
	// keys are silently ignored, matching the original switch behaviour.
	dest := map[string]*string{
		"Job ID":        &meta.JobID,
		"Job Name":      &meta.JobName,
		"Nodelist":      &meta.Node,
		"Job Started":   &meta.StartedAt,
		"Directory":     &meta.Directory,
		"BENCH_VARIANT": &meta.BenchVariant,
		"BENCH_PARAM":   &meta.BenchParam,
		"BENCH_NSPINS":  &meta.BenchNSpins,
	}
	if field, known := dest[strings.TrimSpace(key)]; known {
		*field = strings.TrimSpace(val)
	}
}
// InferVariant returns the benchmark variant for a run.
//
// Priority:
//  1. Explicit BENCH_VARIANT metadata emitted by submit.sh (most reliable).
//  2. The path segment immediately following "kyber/" in the SLURM Directory
//     field (works for old-style runs that ran from inside the kyber tree).
//  3. "unknown" if neither is available.
func InferVariant(meta Meta) string {
	if v := meta.BenchVariant; v != "" {
		return v
	}
	const marker = "kyber/"
	pos := strings.LastIndex(meta.Directory, marker)
	if pos < 0 {
		return "unknown"
	}
	tail := meta.Directory[pos+len(marker):]
	name, _, _ := strings.Cut(tail, "/")
	return name
}

133
analysis/pkg/stats/stats.go Normal file
View File

@ -0,0 +1,133 @@
// Package stats computes summary statistics over slices of cycle counts.
package stats
import (
"cmp"
"math"
"math/rand/v2"
"slices"
)
// bootstrapN is the number of bootstrap resamples used for the median CI.
const bootstrapN = 10_000

// Summary holds all computed statistics for one (algorithm, variant, operation) group.
type Summary struct {
	N    int     // sample size
	Mean float64 // arithmetic mean
	// Median is the sample median (p50).
	Median float64
	Std    float64 // population standard deviation (divides by N, not N-1)
	MAD    float64 // median absolute deviation from the median
	P5     float64 // 5th percentile
	P25    float64 // 25th percentile
	P75    float64 // 75th percentile
	P95    float64 // 95th percentile
	P99    float64 // 99th percentile
	// CI95 is the bootstrapped 95% confidence interval for the median.
	CI95 [2]float64
}
// Compute derives all statistics from a sorted (ascending) slice of values.
// The caller must sort the slice before passing it in.
func Compute(sorted []int64) Summary {
	if len(sorted) == 0 {
		return Summary{}
	}
	// Mean and median feed the spread statistics, so compute them first.
	m := mean(sorted)
	med := percentileFromSorted(sorted, 50)
	return Summary{
		N:      len(sorted),
		Mean:   m,
		Median: med,
		Std:    stddev(sorted, m),
		MAD:    mad(sorted, med),
		P5:     percentileFromSorted(sorted, 5),
		P25:    percentileFromSorted(sorted, 25),
		P75:    percentileFromSorted(sorted, 75),
		P95:    percentileFromSorted(sorted, 95),
		P99:    percentileFromSorted(sorted, 99),
		CI95:   bootstrapMedianCI(sorted, bootstrapN),
	}
}
// mean returns the arithmetic mean of xs (caller guarantees xs is non-empty).
func mean(xs []int64) float64 {
	total := 0.0
	for _, v := range xs {
		total += float64(v)
	}
	return total / float64(len(xs))
}
// stddev returns the population standard deviation of xs about mean m.
func stddev(xs []int64, m float64) float64 {
	var sumSq float64
	for _, v := range xs {
		diff := float64(v) - m
		sumSq += diff * diff
	}
	return math.Sqrt(sumSq / float64(len(xs)))
}
// mad returns the median absolute deviation of sorted about the given median.
func mad(sorted []int64, median float64) float64 {
	devs := make([]float64, 0, len(sorted))
	for _, v := range sorted {
		devs = append(devs, math.Abs(float64(v)-median))
	}
	slices.Sort(devs)
	mid := len(devs) / 2
	if len(devs)%2 == 0 {
		// Even count: average the two central deviations.
		return (devs[mid-1] + devs[mid]) / 2
	}
	return devs[mid]
}
// percentileFromSorted uses linear interpolation (same as numpy's default).
func percentileFromSorted(sorted []int64, p float64) float64 {
	if len(sorted) == 1 {
		return float64(sorted[0])
	}
	// Fractional rank into the sorted slice; interpolate its neighbours.
	pos := p / 100 * float64(len(sorted)-1)
	lower := int(math.Floor(pos))
	upper := int(math.Ceil(pos))
	w := pos - float64(lower)
	return (1-w)*float64(sorted[lower]) + w*float64(sorted[upper])
}
// bootstrapMedianCI resamples the data bootstrapN times and returns the
// [2.5th, 97.5th] percentile of the bootstrap median distribution.
func bootstrapMedianCI(sorted []int64, iters int) [2]float64 {
n := len(sorted)
buf := make([]int64, n)
medians := make([]float64, iters)
for i := range iters {
for j := range n {
buf[j] = sorted[rand.IntN(n)]
}
slices.Sort(buf)
medians[i] = percentileFromSorted(buf, 50)
}
slices.Sort(medians)
return [2]float64{
percentile64(medians, 2.5),
percentile64(medians, 97.5),
}
}
// percentile64 is the float64 analogue of percentileFromSorted.
func percentile64(sorted []float64, p float64) float64 {
	if len(sorted) == 1 {
		return sorted[0]
	}
	pos := p / 100 * float64(len(sorted)-1)
	i := int(math.Floor(pos))
	j := int(math.Ceil(pos))
	w := pos - float64(i)
	return (1-w)*sorted[i] + w*sorted[j]
}
// SortInt64 sorts a slice of int64 in place (ascending).
func SortInt64(xs []int64) {
	slices.SortFunc(xs, func(a, b int64) int { return cmp.Compare(a, b) })
}

4382
analysis/results.json Normal file

File diff suppressed because it is too large Load Diff

726962
analysis/results_raw.json Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,19 @@
## SLURM PROLOG ###############################################################
## Job ID : 1179894
## Job Name : bench_mlkem1024_avx2
## Nodelist : node2334
## CPUs : 1
## Mem/Node : 256 MB
## Directory : /oscar/data/lshu/lneuwirt/where-simd-helps/slurm
## Job Started : Thu Apr 2 12:18:20 PM EDT 2026
###############################################################################
pid 1627591's current affinity list: 41
## BENCH_VARIANT : avx2
## BENCH_PARAM : 1024
## BENCH_NSPINS : 1000
## BENCH_BINARY : /users/lneuwirt/data/lneuwirt/where-simd-helps/harness/build-hpc/bench_mlkem1024_avx2
## BENCH_DATE : 2026-04-02T12:18:20-04:00
## CPU_MODEL : Intel(R) Xeon(R) Platinum 8268 CPU @ 2.90GHz
## PERF_PARANOID : 2
## PAPI_BUILD : OFF
ERROR: binary not found or not executable:

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,19 @@
## SLURM PROLOG ###############################################################
## Job ID : 1179893
## Job Name : bench_mlkem1024_ref
## Nodelist : node2334
## CPUs : 1
## Mem/Node : 256 MB
## Directory : /oscar/data/lshu/lneuwirt/where-simd-helps/slurm
## Job Started : Thu Apr 2 12:18:20 PM EDT 2026
###############################################################################
pid 1627590's current affinity list: 40
## BENCH_VARIANT : ref
## BENCH_PARAM : 1024
## BENCH_NSPINS : 1000
## BENCH_BINARY : /users/lneuwirt/data/lneuwirt/where-simd-helps/harness/build-hpc/bench_mlkem1024_ref
## BENCH_DATE : 2026-04-02T12:18:20-04:00
## CPU_MODEL : Intel(R) Xeon(R) Platinum 8268 CPU @ 2.90GHz
## PERF_PARANOID : 2
## PAPI_BUILD : OFF
ERROR: binary not found or not executable:

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,19 @@
## SLURM PROLOG ###############################################################
## Job ID : 1179890
## Job Name : bench_mlkem512_avx2
## Nodelist : node2333
## CPUs : 1
## Mem/Node : 256 MB
## Directory : /oscar/data/lshu/lneuwirt/where-simd-helps/slurm
## Job Started : Thu Apr 2 12:18:20 PM EDT 2026
###############################################################################
pid 2240632's current affinity list: 40
## BENCH_VARIANT : avx2
## BENCH_PARAM : 512
## BENCH_NSPINS : 1000
## BENCH_BINARY : /users/lneuwirt/data/lneuwirt/where-simd-helps/harness/build-hpc/bench_mlkem512_avx2
## BENCH_DATE : 2026-04-02T12:18:20-04:00
## CPU_MODEL : Intel(R) Xeon(R) Platinum 8268 CPU @ 2.90GHz
## PERF_PARANOID : 2
## PAPI_BUILD : OFF
ERROR: binary not found or not executable:

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,19 @@
## SLURM PROLOG ###############################################################
## Job ID : 1179889
## Job Name : bench_mlkem512_ref
## Nodelist : node2333
## CPUs : 1
## Mem/Node : 256 MB
## Directory : /oscar/data/lshu/lneuwirt/where-simd-helps/slurm
## Job Started : Thu Apr 2 12:18:20 PM EDT 2026
###############################################################################
pid 2240630's current affinity list: 39
## BENCH_VARIANT : ref
## BENCH_PARAM : 512
## BENCH_NSPINS : 1000
## BENCH_BINARY : /users/lneuwirt/data/lneuwirt/where-simd-helps/harness/build-hpc/bench_mlkem512_ref
## BENCH_DATE : 2026-04-02T12:18:20-04:00
## CPU_MODEL : Intel(R) Xeon(R) Platinum 8268 CPU @ 2.90GHz
## PERF_PARANOID : 2
## PAPI_BUILD : OFF
ERROR: binary not found or not executable:

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,19 @@
## SLURM PROLOG ###############################################################
## Job ID : 1179892
## Job Name : bench_mlkem768_avx2
## Nodelist : node2334
## CPUs : 1
## Mem/Node : 256 MB
## Directory : /oscar/data/lshu/lneuwirt/where-simd-helps/slurm
## Job Started : Thu Apr 2 12:18:20 PM EDT 2026
###############################################################################
pid 1627592's current affinity list: 32
## BENCH_VARIANT : avx2
## BENCH_PARAM : 768
## BENCH_NSPINS : 1000
## BENCH_BINARY : /users/lneuwirt/data/lneuwirt/where-simd-helps/harness/build-hpc/bench_mlkem768_avx2
## BENCH_DATE : 2026-04-02T12:18:20-04:00
## CPU_MODEL : Intel(R) Xeon(R) Platinum 8268 CPU @ 2.90GHz
## PERF_PARANOID : 2
## PAPI_BUILD : OFF
ERROR: binary not found or not executable:

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,19 @@
## SLURM PROLOG ###############################################################
## Job ID : 1179891
## Job Name : bench_mlkem768_ref
## Nodelist : node2333
## CPUs : 1
## Mem/Node : 256 MB
## Directory : /oscar/data/lshu/lneuwirt/where-simd-helps/slurm
## Job Started : Thu Apr 2 12:18:20 PM EDT 2026
###############################################################################
pid 2240631's current affinity list: 42
## BENCH_VARIANT : ref
## BENCH_PARAM : 768
## BENCH_NSPINS : 1000
## BENCH_BINARY : /users/lneuwirt/data/lneuwirt/where-simd-helps/harness/build-hpc/bench_mlkem768_ref
## BENCH_DATE : 2026-04-02T12:18:20-04:00
## CPU_MODEL : Intel(R) Xeon(R) Platinum 8268 CPU @ 2.90GHz
## PERF_PARANOID : 2
## PAPI_BUILD : OFF
ERROR: binary not found or not executable:

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,57 +1,157 @@
cmake_minimum_required(VERSION 3.20)
project(pqc-bench C)
project(pqc-bench C ASM)
set(CMAKE_C_STANDARD 11)
# Compiler flags
# Release build with full optimization; override on the command line:
# cmake -DCMAKE_BUILD_TYPE=Debug ..
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
set(CMAKE_C_FLAGS_RELEASE "-O3 -march=native")
set(KYBER_ROOT ${CMAKE_SOURCE_DIR}/../algorithms/kyber)
# Algorithm root (submodule)
# Each target below compiles a variant of test_speed.c against a specific
# algorithm build. Add algorithm libraries as submodule CMake subdirectories
# or via add_library() here as the project grows.
#
# Example layout once kyber submodule is added:
# algorithms/kyber/ref/ static lib kyber512_ref, kyber768_ref, kyber1024_ref
# algorithms/kyber/avx2/ static lib kyber512_avx2, ...
# Helpers shared across variants
# cpucycles / speed_print live in the kyber ref test dir; both variants use
# the same copies (avx2/test/ has identical files).
set(BENCH_HELPERS
${KYBER_ROOT}/ref/test/cpucycles.c
${KYBER_ROOT}/ref/test/speed_print.c
)
# Harness source
set(HARNESS_SRC src/test_speed.c)
set(HARNESS_SRC ${CMAKE_SOURCE_DIR}/src/test_speed.c)
# Build variants
# Uncomment and adjust as algorithm libraries become available.
#
# foreach(PARAM 512 768 1024)
# foreach(VARIANT ref refnv)
# set(TARGET "bench_mlkem${PARAM}_${VARIANT}")
# add_executable(${TARGET} ${HARNESS_SRC})
# target_include_directories(${TARGET} PRIVATE
# ${CMAKE_SOURCE_DIR}/../algorithms/kyber/${VARIANT})
# target_link_libraries(${TARGET} kyber${PARAM}_${VARIANT})
# target_compile_definitions(${TARGET} PRIVATE KYBER_K=${PARAM})
# endforeach()
# endforeach()
# ref sources (pure C, portable)
set(REF_DIR ${KYBER_ROOT}/ref)
set(REF_SOURCES
${REF_DIR}/kem.c
${REF_DIR}/indcpa.c
${REF_DIR}/polyvec.c
${REF_DIR}/poly.c
${REF_DIR}/ntt.c
${REF_DIR}/cbd.c
${REF_DIR}/reduce.c
${REF_DIR}/verify.c
${REF_DIR}/fips202.c
${REF_DIR}/symmetric-shake.c
${REF_DIR}/randombytes.c
)
# avx2 sources (C + x86 assembly)
set(AVX2_DIR ${KYBER_ROOT}/avx2)
set(AVX2_SOURCES
${AVX2_DIR}/kem.c
${AVX2_DIR}/indcpa.c
${AVX2_DIR}/polyvec.c
${AVX2_DIR}/poly.c
${AVX2_DIR}/cbd.c
${AVX2_DIR}/verify.c
${AVX2_DIR}/fips202.c
${AVX2_DIR}/fips202x4.c
${AVX2_DIR}/symmetric-shake.c
${AVX2_DIR}/randombytes.c
${AVX2_DIR}/consts.c
${AVX2_DIR}/rejsample.c
${AVX2_DIR}/fq.S
${AVX2_DIR}/shuffle.S
${AVX2_DIR}/ntt.S
${AVX2_DIR}/invntt.S
${AVX2_DIR}/basemul.S
${AVX2_DIR}/keccak4x/KeccakP-1600-times4-SIMD256.c
)
# KYBER_K mapping
# 512 K=2, 768 K=3, 1024 K=4
set(KYBER_K_512 2)
set(KYBER_K_768 3)
set(KYBER_K_1024 4)
# Build targets
foreach(LEVEL 512 768 1024)
set(K ${KYBER_K_${LEVEL}})
# ref optimised reference (O3, auto-vectorisation enabled)
set(REF_TARGET bench_mlkem${LEVEL}_ref)
add_executable(${REF_TARGET}
${HARNESS_SRC}
${REF_SOURCES}
${BENCH_HELPERS}
)
target_include_directories(${REF_TARGET} PRIVATE
${REF_DIR}
${REF_DIR}/test
)
target_compile_definitions(${REF_TARGET} PRIVATE KYBER_K=${K})
target_compile_options(${REF_TARGET} PRIVATE -O3 -fomit-frame-pointer)
# refnv ref with auto-vectorisation disabled; isolates scalar O3 performance
set(REFNV_TARGET bench_mlkem${LEVEL}_refnv)
add_executable(${REFNV_TARGET}
${HARNESS_SRC}
${REF_SOURCES}
${BENCH_HELPERS}
)
target_include_directories(${REFNV_TARGET} PRIVATE
${REF_DIR}
${REF_DIR}/test
)
target_compile_definitions(${REFNV_TARGET} PRIVATE KYBER_K=${K})
target_compile_options(${REFNV_TARGET} PRIVATE
-O3 -fomit-frame-pointer -fno-tree-vectorize
)
# refo0 ref at -O0; establishes unoptimised baseline
set(REFO0_TARGET bench_mlkem${LEVEL}_refo0)
add_executable(${REFO0_TARGET}
${HARNESS_SRC}
${REF_SOURCES}
${BENCH_HELPERS}
)
target_include_directories(${REFO0_TARGET} PRIVATE
${REF_DIR}
${REF_DIR}/test
)
target_compile_definitions(${REFO0_TARGET} PRIVATE KYBER_K=${K})
target_compile_options(${REFO0_TARGET} PRIVATE -O0)
# avx2 hand-written AVX2 assembly + O3
set(AVX2_TARGET bench_mlkem${LEVEL}_avx2)
add_executable(${AVX2_TARGET}
${HARNESS_SRC}
${AVX2_SOURCES}
${BENCH_HELPERS}
)
target_include_directories(${AVX2_TARGET} PRIVATE
${AVX2_DIR}
${AVX2_DIR}/test
${AVX2_DIR}/keccak4x
)
target_compile_definitions(${AVX2_TARGET} PRIVATE KYBER_K=${K})
target_compile_options(${AVX2_TARGET} PRIVATE
-O3 -fomit-frame-pointer -mavx2 -mbmi2 -mpopcnt -march=native -mtune=native
)
endforeach()
# PAPI (hardware performance counters)
# Optional; enable with -DWITH_PAPI=ON
option(WITH_PAPI "Link against PAPI for hardware counter collection" OFF)
if(WITH_PAPI)
find_library(PAPI_LIB papi REQUIRED)
find_path(PAPI_INCLUDE papi.h REQUIRED)
# Targets that need PAPI:
# target_include_directories(<target> PRIVATE ${PAPI_INCLUDE})
# target_link_libraries(<target> ${PAPI_LIB})
foreach(LEVEL 512 768 1024)
foreach(VARIANT ref refnv refo0 avx2)
set(T bench_mlkem${LEVEL}_${VARIANT})
target_include_directories(${T} PRIVATE ${PAPI_INCLUDE})
target_link_libraries(${T} ${PAPI_LIB})
target_compile_definitions(${T} PRIVATE WITH_PAPI)
endforeach()
endforeach()
endif()
# RAPL energy measurement
# Optional; enable with -DWITH_RAPL=ON (requires root or CAP_SYS_RAWIO)
# Requires root or CAP_SYS_RAWIO on the benchmark node.
option(WITH_RAPL "Enable Intel RAPL energy measurement" OFF)
if(WITH_RAPL)
# target_compile_definitions(<target> PRIVATE WITH_RAPL)
foreach(LEVEL 512 768 1024)
foreach(VARIANT ref refnv refo0 avx2)
target_compile_definitions(bench_mlkem${LEVEL}_${VARIANT} PRIVATE WITH_RAPL)
endforeach()
endforeach()
endif()

View File

@ -0,0 +1,394 @@
# This is the CMakeCache file.
# For build in directory: /home/jeans/Repos/research/pqc/where-simd-helps/harness/build-papi
# It was generated by CMake: /usr/bin/cmake
# You can edit this file to change values found and used by cmake.
# If you do not want to change any of the values, simply exit the editor.
# If you do want to change a value, simply edit, save, and exit the editor.
# The syntax for the file is as follows:
# KEY:TYPE=VALUE
# KEY is the name of a variable in the cache.
# TYPE is a hint to GUIs for the type of VALUE, DO NOT EDIT TYPE!.
# VALUE is the current value for the KEY.
########################
# EXTERNAL cache entries
########################
//Path to a program.
CMAKE_ADDR2LINE:FILEPATH=/usr/bin/addr2line
//Path to a program.
CMAKE_AR:FILEPATH=/usr/bin/ar
//ASM compiler
CMAKE_ASM_COMPILER:FILEPATH=/usr/bin/cc
//A wrapper around 'ar' adding the appropriate '--plugin' option
// for the GCC compiler
CMAKE_ASM_COMPILER_AR:FILEPATH=/usr/bin/gcc-ar
//A wrapper around 'ranlib' adding the appropriate '--plugin' option
// for the GCC compiler
CMAKE_ASM_COMPILER_RANLIB:FILEPATH=/usr/bin/gcc-ranlib
//Flags used by the ASM compiler during all build types.
CMAKE_ASM_FLAGS:STRING=
//Flags used by the ASM compiler during DEBUG builds.
CMAKE_ASM_FLAGS_DEBUG:STRING=-g
//Flags used by the ASM compiler during MINSIZEREL builds.
CMAKE_ASM_FLAGS_MINSIZEREL:STRING=-Os -DNDEBUG
//Flags used by the ASM compiler during RELEASE builds.
CMAKE_ASM_FLAGS_RELEASE:STRING=-O3 -DNDEBUG
//Flags used by the ASM compiler during RELWITHDEBINFO builds.
CMAKE_ASM_FLAGS_RELWITHDEBINFO:STRING=-O2 -g -DNDEBUG
//Choose the type of build, options are: None Debug Release RelWithDebInfo
// MinSizeRel ...
CMAKE_BUILD_TYPE:STRING=Release
//Enable/Disable color output during build.
CMAKE_COLOR_MAKEFILE:BOOL=ON
//C compiler
CMAKE_C_COMPILER:FILEPATH=/usr/bin/cc
//A wrapper around 'ar' adding the appropriate '--plugin' option
// for the GCC compiler
CMAKE_C_COMPILER_AR:FILEPATH=/usr/bin/gcc-ar
//A wrapper around 'ranlib' adding the appropriate '--plugin' option
// for the GCC compiler
CMAKE_C_COMPILER_RANLIB:FILEPATH=/usr/bin/gcc-ranlib
//Flags used by the C compiler during all build types.
CMAKE_C_FLAGS:STRING=
//Flags used by the C compiler during DEBUG builds.
CMAKE_C_FLAGS_DEBUG:STRING=-g
//Flags used by the C compiler during MINSIZEREL builds.
CMAKE_C_FLAGS_MINSIZEREL:STRING=-Os -DNDEBUG
//Flags used by the C compiler during RELEASE builds.
CMAKE_C_FLAGS_RELEASE:STRING=-O3 -DNDEBUG
//Flags used by the C compiler during RELWITHDEBINFO builds.
CMAKE_C_FLAGS_RELWITHDEBINFO:STRING=-O2 -g -DNDEBUG
//Path to a program.
CMAKE_DLLTOOL:FILEPATH=CMAKE_DLLTOOL-NOTFOUND
//Flags used by the linker during all build types.
CMAKE_EXE_LINKER_FLAGS:STRING=
//Flags used by the linker during DEBUG builds.
CMAKE_EXE_LINKER_FLAGS_DEBUG:STRING=
//Flags used by the linker during MINSIZEREL builds.
CMAKE_EXE_LINKER_FLAGS_MINSIZEREL:STRING=
//Flags used by the linker during RELEASE builds.
CMAKE_EXE_LINKER_FLAGS_RELEASE:STRING=
//Flags used by the linker during RELWITHDEBINFO builds.
CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
//Enable/Disable output of compile commands during generation.
CMAKE_EXPORT_COMPILE_COMMANDS:BOOL=
//Value Computed by CMake.
CMAKE_FIND_PACKAGE_REDIRECTS_DIR:STATIC=/home/jeans/Repos/research/pqc/where-simd-helps/harness/build-papi/CMakeFiles/pkgRedirects
//Install path prefix, prepended onto install directories.
CMAKE_INSTALL_PREFIX:PATH=/usr/local
//Path to a program.
CMAKE_LINKER:FILEPATH=/usr/bin/ld
//Path to a program.
CMAKE_MAKE_PROGRAM:FILEPATH=/usr/bin/make
//Flags used by the linker during the creation of modules during
// all build types.
CMAKE_MODULE_LINKER_FLAGS:STRING=
//Flags used by the linker during the creation of modules during
// DEBUG builds.
CMAKE_MODULE_LINKER_FLAGS_DEBUG:STRING=
//Flags used by the linker during the creation of modules during
// MINSIZEREL builds.
CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL:STRING=
//Flags used by the linker during the creation of modules during
// RELEASE builds.
CMAKE_MODULE_LINKER_FLAGS_RELEASE:STRING=
//Flags used by the linker during the creation of modules during
// RELWITHDEBINFO builds.
CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
//Path to a program.
CMAKE_NM:FILEPATH=/usr/bin/nm
//Path to a program.
CMAKE_OBJCOPY:FILEPATH=/usr/bin/objcopy
//Path to a program.
CMAKE_OBJDUMP:FILEPATH=/usr/bin/objdump
//Value Computed by CMake
CMAKE_PROJECT_COMPAT_VERSION:STATIC=
//Value Computed by CMake
CMAKE_PROJECT_DESCRIPTION:STATIC=
//Value Computed by CMake
CMAKE_PROJECT_HOMEPAGE_URL:STATIC=
//Value Computed by CMake
CMAKE_PROJECT_NAME:STATIC=pqc-bench
//Value Computed by CMake
CMAKE_PROJECT_SPDX_LICENSE:STATIC=
//Path to a program.
CMAKE_RANLIB:FILEPATH=/usr/bin/ranlib
//Path to a program.
CMAKE_READELF:FILEPATH=/usr/bin/readelf
//Flags used by the linker during the creation of shared libraries
// during all build types.
CMAKE_SHARED_LINKER_FLAGS:STRING=
//Flags used by the linker during the creation of shared libraries
// during DEBUG builds.
CMAKE_SHARED_LINKER_FLAGS_DEBUG:STRING=
//Flags used by the linker during the creation of shared libraries
// during MINSIZEREL builds.
CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL:STRING=
//Flags used by the linker during the creation of shared libraries
// during RELEASE builds.
CMAKE_SHARED_LINKER_FLAGS_RELEASE:STRING=
//Flags used by the linker during the creation of shared libraries
// during RELWITHDEBINFO builds.
CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO:STRING=
//If set, runtime paths are not added when installing shared libraries,
// but are added when building.
CMAKE_SKIP_INSTALL_RPATH:BOOL=NO
//If set, runtime paths are not added when using shared libraries.
CMAKE_SKIP_RPATH:BOOL=NO
//Flags used by the archiver during the creation of static libraries
// during all build types.
CMAKE_STATIC_LINKER_FLAGS:STRING=
//Flags used by the archiver during the creation of static libraries
// during DEBUG builds.
CMAKE_STATIC_LINKER_FLAGS_DEBUG:STRING=
//Flags used by the archiver during the creation of static libraries
// during MINSIZEREL builds.
CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL:STRING=
//Flags used by the archiver during the creation of static libraries
// during RELEASE builds.
CMAKE_STATIC_LINKER_FLAGS_RELEASE:STRING=
//Flags used by the archiver during the creation of static libraries
// during RELWITHDEBINFO builds.
CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO:STRING=
//Path to a program.
CMAKE_STRIP:FILEPATH=/usr/bin/strip
//Path to a program.
CMAKE_TAPI:FILEPATH=CMAKE_TAPI-NOTFOUND
//If this value is on, makefiles will be generated without the
// .SILENT directive, and all commands will be echoed to the console
// during the make. This is useful for debugging only. With Visual
// Studio IDE projects all commands are done without /nologo.
CMAKE_VERBOSE_MAKEFILE:BOOL=FALSE
//Path to a library.
PAPI_LIB:FILEPATH=PAPI_LIB-NOTFOUND
//Link against PAPI for hardware counter collection
WITH_PAPI:BOOL=ON
//Value Computed by CMake
pqc-bench_BINARY_DIR:STATIC=/home/jeans/Repos/research/pqc/where-simd-helps/harness/build-papi
//Value Computed by CMake
pqc-bench_IS_TOP_LEVEL:STATIC=ON
//Value Computed by CMake
pqc-bench_SOURCE_DIR:STATIC=/home/jeans/Repos/research/pqc/where-simd-helps/harness
########################
# INTERNAL cache entries
########################
//ADVANCED property for variable: CMAKE_ADDR2LINE
CMAKE_ADDR2LINE-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_AR
CMAKE_AR-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_ASM_COMPILER
CMAKE_ASM_COMPILER-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_ASM_COMPILER_AR
CMAKE_ASM_COMPILER_AR-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_ASM_COMPILER_RANLIB
CMAKE_ASM_COMPILER_RANLIB-ADVANCED:INTERNAL=1
CMAKE_ASM_COMPILER_WORKS:INTERNAL=1
//ADVANCED property for variable: CMAKE_ASM_FLAGS
CMAKE_ASM_FLAGS-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_ASM_FLAGS_DEBUG
CMAKE_ASM_FLAGS_DEBUG-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_ASM_FLAGS_MINSIZEREL
CMAKE_ASM_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_ASM_FLAGS_RELEASE
CMAKE_ASM_FLAGS_RELEASE-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_ASM_FLAGS_RELWITHDEBINFO
CMAKE_ASM_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
//This is the directory where this CMakeCache.txt was created
CMAKE_CACHEFILE_DIR:INTERNAL=/home/jeans/Repos/research/pqc/where-simd-helps/harness/build-papi
//Major version of cmake used to create the current loaded cache
CMAKE_CACHE_MAJOR_VERSION:INTERNAL=4
//Minor version of cmake used to create the current loaded cache
CMAKE_CACHE_MINOR_VERSION:INTERNAL=3
//Patch version of cmake used to create the current loaded cache
CMAKE_CACHE_PATCH_VERSION:INTERNAL=1
//ADVANCED property for variable: CMAKE_COLOR_MAKEFILE
CMAKE_COLOR_MAKEFILE-ADVANCED:INTERNAL=1
//Path to CMake executable.
CMAKE_COMMAND:INTERNAL=/usr/bin/cmake
//Path to cpack program executable.
CMAKE_CPACK_COMMAND:INTERNAL=/usr/bin/cpack
//Path to ctest program executable.
CMAKE_CTEST_COMMAND:INTERNAL=/usr/bin/ctest
//ADVANCED property for variable: CMAKE_C_COMPILER
CMAKE_C_COMPILER-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_C_COMPILER_AR
CMAKE_C_COMPILER_AR-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_C_COMPILER_RANLIB
CMAKE_C_COMPILER_RANLIB-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_C_FLAGS
CMAKE_C_FLAGS-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_C_FLAGS_DEBUG
CMAKE_C_FLAGS_DEBUG-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_C_FLAGS_MINSIZEREL
CMAKE_C_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_RELEASE-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_C_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_DLLTOOL
CMAKE_DLLTOOL-ADVANCED:INTERNAL=1
//Path to cache edit program executable.
CMAKE_EDIT_COMMAND:INTERNAL=/usr/bin/ccmake
//Executable file format
CMAKE_EXECUTABLE_FORMAT:INTERNAL=ELF
//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS
CMAKE_EXE_LINKER_FLAGS-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_DEBUG
CMAKE_EXE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_MINSIZEREL
CMAKE_EXE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELEASE
CMAKE_EXE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO
CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_EXPORT_COMPILE_COMMANDS
CMAKE_EXPORT_COMPILE_COMMANDS-ADVANCED:INTERNAL=1
//Name of external makefile project generator.
CMAKE_EXTRA_GENERATOR:INTERNAL=
//Name of generator.
CMAKE_GENERATOR:INTERNAL=Unix Makefiles
//Generator instance identifier.
CMAKE_GENERATOR_INSTANCE:INTERNAL=
//Name of generator platform.
CMAKE_GENERATOR_PLATFORM:INTERNAL=
//Name of generator toolset.
CMAKE_GENERATOR_TOOLSET:INTERNAL=
//Source directory with the top level CMakeLists.txt file for this
// project
CMAKE_HOME_DIRECTORY:INTERNAL=/home/jeans/Repos/research/pqc/where-simd-helps/harness
//Install .so files without execute permission.
CMAKE_INSTALL_SO_NO_EXE:INTERNAL=0
//ADVANCED property for variable: CMAKE_LINKER
CMAKE_LINKER-ADVANCED:INTERNAL=1
//Name of CMakeLists files to read
CMAKE_LIST_FILE_NAME:INTERNAL=CMakeLists.txt
//ADVANCED property for variable: CMAKE_MAKE_PROGRAM
CMAKE_MAKE_PROGRAM-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS
CMAKE_MODULE_LINKER_FLAGS-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_DEBUG
CMAKE_MODULE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL
CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELEASE
CMAKE_MODULE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO
CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_NM
CMAKE_NM-ADVANCED:INTERNAL=1
//number of local generators
CMAKE_NUMBER_OF_MAKEFILES:INTERNAL=1
//ADVANCED property for variable: CMAKE_OBJCOPY
CMAKE_OBJCOPY-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_OBJDUMP
CMAKE_OBJDUMP-ADVANCED:INTERNAL=1
//Platform information initialized
CMAKE_PLATFORM_INFO_INITIALIZED:INTERNAL=1
//ADVANCED property for variable: CMAKE_RANLIB
CMAKE_RANLIB-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_READELF
CMAKE_READELF-ADVANCED:INTERNAL=1
//Path to CMake installation.
CMAKE_ROOT:INTERNAL=/usr/share/cmake
//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS
CMAKE_SHARED_LINKER_FLAGS-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_DEBUG
CMAKE_SHARED_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL
CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELEASE
CMAKE_SHARED_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO
CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_SKIP_INSTALL_RPATH
CMAKE_SKIP_INSTALL_RPATH-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_SKIP_RPATH
CMAKE_SKIP_RPATH-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS
CMAKE_STATIC_LINKER_FLAGS-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_DEBUG
CMAKE_STATIC_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL
CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELEASE
CMAKE_STATIC_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO
CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_STRIP
CMAKE_STRIP-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_TAPI
CMAKE_TAPI-ADVANCED:INTERNAL=1
//uname command
CMAKE_UNAME:INTERNAL=/usr/bin/uname
//ADVANCED property for variable: CMAKE_VERBOSE_MAKEFILE
CMAKE_VERBOSE_MAKEFILE-ADVANCED:INTERNAL=1

View File

@ -0,0 +1,30 @@
set(CMAKE_ASM_COMPILER "/usr/bin/cc")
set(CMAKE_ASM_COMPILER_ARG1 "")
set(CMAKE_AR "/usr/bin/ar")
set(CMAKE_ASM_COMPILER_AR "/usr/bin/gcc-ar")
set(CMAKE_RANLIB "/usr/bin/ranlib")
set(CMAKE_ASM_COMPILER_RANLIB "/usr/bin/gcc-ranlib")
set(CMAKE_LINKER "/usr/bin/ld")
set(CMAKE_LINKER_LINK "")
set(CMAKE_LINKER_LLD "")
set(CMAKE_ASM_COMPILER_LINKER "")
set(CMAKE_ASM_COMPILER_LINKER_ID "")
set(CMAKE_ASM_COMPILER_LINKER_VERSION )
set(CMAKE_ASM_COMPILER_LINKER_FRONTEND_VARIANT )
set(CMAKE_MT "")
set(CMAKE_TAPI "CMAKE_TAPI-NOTFOUND")
set(CMAKE_ASM_COMPILER_LOADED 1)
set(CMAKE_ASM_COMPILER_ID "GNU")
set(CMAKE_ASM_COMPILER_VERSION "")
set(CMAKE_ASM_COMPILER_ENV_VAR "ASM")
set(CMAKE_ASM_COMPILER_ARCHITECTURE_ID "")
set(CMAKE_ASM_IGNORE_EXTENSIONS h;H;o;O;obj;OBJ;def;DEF;rc;RC)
set(CMAKE_ASM_LINKER_PREFERENCE 0)
set(CMAKE_ASM_LINKER_DEPFILE_SUPPORTED )
set(CMAKE_LINKER_PUSHPOP_STATE_SUPPORTED TRUE)
set(CMAKE_ASM_LINKER_PUSHPOP_STATE_SUPPORTED )

View File

@ -0,0 +1,85 @@
set(CMAKE_C_COMPILER "/usr/bin/cc")
set(CMAKE_C_COMPILER_ARG1 "")
set(CMAKE_C_COMPILER_ID "GNU")
set(CMAKE_C_COMPILER_VERSION "15.2.1")
set(CMAKE_C_COMPILER_VERSION_INTERNAL "")
set(CMAKE_C_COMPILER_WRAPPER "")
set(CMAKE_C_STANDARD_COMPUTED_DEFAULT "23")
set(CMAKE_C_EXTENSIONS_COMPUTED_DEFAULT "ON")
set(CMAKE_C_STANDARD_LATEST "23")
set(CMAKE_C_COMPILE_FEATURES "c_std_90;c_function_prototypes;c_std_99;c_restrict;c_variadic_macros;c_std_11;c_static_assert;c_std_17;c_std_23")
set(CMAKE_C90_COMPILE_FEATURES "c_std_90;c_function_prototypes")
set(CMAKE_C99_COMPILE_FEATURES "c_std_99;c_restrict;c_variadic_macros")
set(CMAKE_C11_COMPILE_FEATURES "c_std_11;c_static_assert")
set(CMAKE_C17_COMPILE_FEATURES "c_std_17")
set(CMAKE_C23_COMPILE_FEATURES "c_std_23")
set(CMAKE_C_PLATFORM_ID "Linux")
set(CMAKE_C_SIMULATE_ID "")
set(CMAKE_C_COMPILER_FRONTEND_VARIANT "GNU")
set(CMAKE_C_COMPILER_APPLE_SYSROOT "")
set(CMAKE_C_SIMULATE_VERSION "")
set(CMAKE_C_COMPILER_ARCHITECTURE_ID "x86_64")
set(CMAKE_AR "/usr/bin/ar")
set(CMAKE_C_COMPILER_AR "/usr/bin/gcc-ar")
set(CMAKE_RANLIB "/usr/bin/ranlib")
set(CMAKE_C_COMPILER_RANLIB "/usr/bin/gcc-ranlib")
set(CMAKE_LINKER "/usr/bin/ld")
set(CMAKE_LINKER_LINK "")
set(CMAKE_LINKER_LLD "")
set(CMAKE_C_COMPILER_LINKER "/usr/bin/ld")
set(CMAKE_C_COMPILER_LINKER_ID "GNU")
set(CMAKE_C_COMPILER_LINKER_VERSION 2.46)
set(CMAKE_C_COMPILER_LINKER_FRONTEND_VARIANT GNU)
set(CMAKE_MT "")
set(CMAKE_TAPI "CMAKE_TAPI-NOTFOUND")
set(CMAKE_COMPILER_IS_GNUCC 1)
set(CMAKE_C_COMPILER_LOADED 1)
set(CMAKE_C_COMPILER_WORKS TRUE)
set(CMAKE_C_ABI_COMPILED TRUE)
set(CMAKE_C_COMPILER_ENV_VAR "CC")
set(CMAKE_C_COMPILER_ID_RUN 1)
set(CMAKE_C_SOURCE_FILE_EXTENSIONS c;m)
set(CMAKE_C_IGNORE_EXTENSIONS h;H;o;O;obj;OBJ;def;DEF;rc;RC)
set(CMAKE_C_LINKER_PREFERENCE 10)
set(CMAKE_C_LINKER_DEPFILE_SUPPORTED TRUE)
set(CMAKE_LINKER_PUSHPOP_STATE_SUPPORTED TRUE)
set(CMAKE_C_LINKER_PUSHPOP_STATE_SUPPORTED TRUE)
# Save compiler ABI information.
set(CMAKE_C_SIZEOF_DATA_PTR "8")
set(CMAKE_C_COMPILER_ABI "ELF")
set(CMAKE_C_BYTE_ORDER "LITTLE_ENDIAN")
set(CMAKE_C_LIBRARY_ARCHITECTURE "")
if(CMAKE_C_SIZEOF_DATA_PTR)
set(CMAKE_SIZEOF_VOID_P "${CMAKE_C_SIZEOF_DATA_PTR}")
endif()
if(CMAKE_C_COMPILER_ABI)
set(CMAKE_INTERNAL_PLATFORM_ABI "${CMAKE_C_COMPILER_ABI}")
endif()
if(CMAKE_C_LIBRARY_ARCHITECTURE)
set(CMAKE_LIBRARY_ARCHITECTURE "")
endif()
set(CMAKE_C_CL_SHOWINCLUDES_PREFIX "")
if(CMAKE_C_CL_SHOWINCLUDES_PREFIX)
set(CMAKE_CL_SHOWINCLUDES_PREFIX "${CMAKE_C_CL_SHOWINCLUDES_PREFIX}")
endif()
set(CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES "/usr/lib/gcc/x86_64-pc-linux-gnu/15.2.1/include;/usr/local/include;/usr/lib/gcc/x86_64-pc-linux-gnu/15.2.1/include-fixed;/usr/include")
set(CMAKE_C_IMPLICIT_LINK_LIBRARIES "gcc;gcc_s;c;gcc;gcc_s")
set(CMAKE_C_IMPLICIT_LINK_DIRECTORIES "/usr/lib/gcc/x86_64-pc-linux-gnu/15.2.1;/usr/lib;/lib")
set(CMAKE_C_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES "")

View File

@ -0,0 +1,15 @@
set(CMAKE_HOST_SYSTEM "Linux-6.19.10-arch1-1")
set(CMAKE_HOST_SYSTEM_NAME "Linux")
set(CMAKE_HOST_SYSTEM_VERSION "6.19.10-arch1-1")
set(CMAKE_HOST_SYSTEM_PROCESSOR "x86_64")
set(CMAKE_SYSTEM "Linux-6.19.10-arch1-1")
set(CMAKE_SYSTEM_NAME "Linux")
set(CMAKE_SYSTEM_VERSION "6.19.10-arch1-1")
set(CMAKE_SYSTEM_PROCESSOR "x86_64")
set(CMAKE_CROSSCOMPILING "FALSE")
set(CMAKE_SYSTEM_LOADED 1)

View File

@ -0,0 +1,934 @@
#ifdef __cplusplus
# error "A C++ compiler has been selected for C."
#endif
#if defined(__18CXX)
# define ID_VOID_MAIN
#endif
#if defined(__CLASSIC_C__)
/* cv-qualifiers did not exist in K&R C */
# define const
# define volatile
#endif
#if !defined(__has_include)
/* If the compiler does not have __has_include, pretend the answer is
always no. */
# define __has_include(x) 0
#endif
/* Version number components: V=Version, R=Revision, P=Patch
Version date components: YYYY=Year, MM=Month, DD=Day */
#if defined(__INTEL_COMPILER) || defined(__ICC)
# define COMPILER_ID "Intel"
# if defined(_MSC_VER)
# define SIMULATE_ID "MSVC"
# endif
# if defined(__GNUC__)
# define SIMULATE_ID "GNU"
# endif
/* __INTEL_COMPILER = VRP prior to 2021, and then VVVV for 2021 and later,
except that a few beta releases use the old format with V=2021. */
# if __INTEL_COMPILER < 2021 || __INTEL_COMPILER == 202110 || __INTEL_COMPILER == 202111
# define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER/100)
# define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER/10 % 10)
# if defined(__INTEL_COMPILER_UPDATE)
# define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER_UPDATE)
# else
# define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER % 10)
# endif
# else
# define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER)
# define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER_UPDATE)
/* The third version component from --version is an update index,
but no macro is provided for it. */
# define COMPILER_VERSION_PATCH DEC(0)
# endif
# if defined(__INTEL_COMPILER_BUILD_DATE)
/* __INTEL_COMPILER_BUILD_DATE = YYYYMMDD */
# define COMPILER_VERSION_TWEAK DEC(__INTEL_COMPILER_BUILD_DATE)
# endif
# if defined(_MSC_VER)
/* _MSC_VER = VVRR */
# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
# endif
# if defined(__GNUC__)
# define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
# elif defined(__GNUG__)
# define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
# endif
# if defined(__GNUC_MINOR__)
# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
# endif
# if defined(__GNUC_PATCHLEVEL__)
# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
# endif
#elif (defined(__clang__) && defined(__INTEL_CLANG_COMPILER)) || defined(__INTEL_LLVM_COMPILER)
# define COMPILER_ID "IntelLLVM"
#if defined(_MSC_VER)
# define SIMULATE_ID "MSVC"
#endif
#if defined(__GNUC__)
# define SIMULATE_ID "GNU"
#endif
/* __INTEL_LLVM_COMPILER = VVVVRP prior to 2021.2.0, VVVVRRPP for 2021.2.0 and
* later. Look for 6 digit vs. 8 digit version number to decide encoding.
* VVVV is no smaller than the current year when a version is released.
*/
#if __INTEL_LLVM_COMPILER < 1000000L
# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/100)
# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/10 % 10)
# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER % 10)
#else
# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/10000)
# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/100 % 100)
# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER % 100)
#endif
#if defined(_MSC_VER)
/* _MSC_VER = VVRR */
# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
#endif
#if defined(__GNUC__)
# define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
#elif defined(__GNUG__)
# define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
#endif
#if defined(__GNUC_MINOR__)
# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
#endif
#if defined(__GNUC_PATCHLEVEL__)
# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
#endif
#elif defined(__PATHCC__)
# define COMPILER_ID "PathScale"
# define COMPILER_VERSION_MAJOR DEC(__PATHCC__)
# define COMPILER_VERSION_MINOR DEC(__PATHCC_MINOR__)
# if defined(__PATHCC_PATCHLEVEL__)
# define COMPILER_VERSION_PATCH DEC(__PATHCC_PATCHLEVEL__)
# endif
#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__)
# define COMPILER_ID "Embarcadero"
# define COMPILER_VERSION_MAJOR HEX(__CODEGEARC_VERSION__>>24 & 0x00FF)
# define COMPILER_VERSION_MINOR HEX(__CODEGEARC_VERSION__>>16 & 0x00FF)
# define COMPILER_VERSION_PATCH DEC(__CODEGEARC_VERSION__ & 0xFFFF)
#elif defined(__BORLANDC__)
# define COMPILER_ID "Borland"
/* __BORLANDC__ = 0xVRR */
# define COMPILER_VERSION_MAJOR HEX(__BORLANDC__>>8)
# define COMPILER_VERSION_MINOR HEX(__BORLANDC__ & 0xFF)
#elif defined(__WATCOMC__) && __WATCOMC__ < 1200
# define COMPILER_ID "Watcom"
/* __WATCOMC__ = VVRR */
# define COMPILER_VERSION_MAJOR DEC(__WATCOMC__ / 100)
# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
# if (__WATCOMC__ % 10) > 0
# define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
# endif
#elif defined(__WATCOMC__)
# define COMPILER_ID "OpenWatcom"
/* __WATCOMC__ = VVRP + 1100 */
# define COMPILER_VERSION_MAJOR DEC((__WATCOMC__ - 1100) / 100)
# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
# if (__WATCOMC__ % 10) > 0
# define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
# endif
#elif defined(__SUNPRO_C)
# define COMPILER_ID "SunPro"
# if __SUNPRO_C >= 0x5100
/* __SUNPRO_C = 0xVRRP */
# define COMPILER_VERSION_MAJOR HEX(__SUNPRO_C>>12)
# define COMPILER_VERSION_MINOR HEX(__SUNPRO_C>>4 & 0xFF)
# define COMPILER_VERSION_PATCH HEX(__SUNPRO_C & 0xF)
# else
/* __SUNPRO_CC = 0xVRP */
# define COMPILER_VERSION_MAJOR HEX(__SUNPRO_C>>8)
# define COMPILER_VERSION_MINOR HEX(__SUNPRO_C>>4 & 0xF)
# define COMPILER_VERSION_PATCH HEX(__SUNPRO_C & 0xF)
# endif
#elif defined(__HP_cc)
# define COMPILER_ID "HP"
/* __HP_cc = VVRRPP */
# define COMPILER_VERSION_MAJOR DEC(__HP_cc/10000)
# define COMPILER_VERSION_MINOR DEC(__HP_cc/100 % 100)
# define COMPILER_VERSION_PATCH DEC(__HP_cc % 100)
#elif defined(__DECC)
# define COMPILER_ID "Compaq"
/* __DECC_VER = VVRRTPPPP */
# define COMPILER_VERSION_MAJOR DEC(__DECC_VER/10000000)
# define COMPILER_VERSION_MINOR DEC(__DECC_VER/100000 % 100)
# define COMPILER_VERSION_PATCH DEC(__DECC_VER % 10000)
#elif defined(__IBMC__) && defined(__COMPILER_VER__)
# define COMPILER_ID "zOS"
/* __IBMC__ = VRP */
# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100)
# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10)
# define COMPILER_VERSION_PATCH DEC(__IBMC__ % 10)
#elif defined(__open_xl__) && defined(__clang__)
# define COMPILER_ID "IBMClang"
# define COMPILER_VERSION_MAJOR DEC(__open_xl_version__)
# define COMPILER_VERSION_MINOR DEC(__open_xl_release__)
# define COMPILER_VERSION_PATCH DEC(__open_xl_modification__)
# define COMPILER_VERSION_TWEAK DEC(__open_xl_ptf_fix_level__)
# define COMPILER_VERSION_INTERNAL_STR __clang_version__
#elif defined(__ibmxl__) && defined(__clang__)
# define COMPILER_ID "XLClang"
# define COMPILER_VERSION_MAJOR DEC(__ibmxl_version__)
# define COMPILER_VERSION_MINOR DEC(__ibmxl_release__)
# define COMPILER_VERSION_PATCH DEC(__ibmxl_modification__)
# define COMPILER_VERSION_TWEAK DEC(__ibmxl_ptf_fix_level__)
#elif defined(__IBMC__) && !defined(__COMPILER_VER__) && __IBMC__ >= 800
# define COMPILER_ID "XL"
/* __IBMC__ = VRP */
# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100)
# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10)
# define COMPILER_VERSION_PATCH DEC(__IBMC__ % 10)
#elif defined(__IBMC__) && !defined(__COMPILER_VER__) && __IBMC__ < 800
# define COMPILER_ID "VisualAge"
/* __IBMC__ = VRP */
# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100)
# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10)
# define COMPILER_VERSION_PATCH DEC(__IBMC__ % 10)
#elif defined(__NVCOMPILER)
# define COMPILER_ID "NVHPC"
# define COMPILER_VERSION_MAJOR DEC(__NVCOMPILER_MAJOR__)
# define COMPILER_VERSION_MINOR DEC(__NVCOMPILER_MINOR__)
# if defined(__NVCOMPILER_PATCHLEVEL__)
# define COMPILER_VERSION_PATCH DEC(__NVCOMPILER_PATCHLEVEL__)
# endif
#elif defined(__PGI)
# define COMPILER_ID "PGI"
# define COMPILER_VERSION_MAJOR DEC(__PGIC__)
# define COMPILER_VERSION_MINOR DEC(__PGIC_MINOR__)
# if defined(__PGIC_PATCHLEVEL__)
# define COMPILER_VERSION_PATCH DEC(__PGIC_PATCHLEVEL__)
# endif
#elif defined(__clang__) && defined(__cray__)
# define COMPILER_ID "CrayClang"
# define COMPILER_VERSION_MAJOR DEC(__cray_major__)
# define COMPILER_VERSION_MINOR DEC(__cray_minor__)
# define COMPILER_VERSION_PATCH DEC(__cray_patchlevel__)
# define COMPILER_VERSION_INTERNAL_STR __clang_version__
#elif defined(_CRAYC)
# define COMPILER_ID "Cray"
# define COMPILER_VERSION_MAJOR DEC(_RELEASE_MAJOR)
# define COMPILER_VERSION_MINOR DEC(_RELEASE_MINOR)
#elif defined(__TI_COMPILER_VERSION__)
# define COMPILER_ID "TI"
/* __TI_COMPILER_VERSION__ = VVVRRRPPP */
# define COMPILER_VERSION_MAJOR DEC(__TI_COMPILER_VERSION__/1000000)
# define COMPILER_VERSION_MINOR DEC(__TI_COMPILER_VERSION__/1000 % 1000)
# define COMPILER_VERSION_PATCH DEC(__TI_COMPILER_VERSION__ % 1000)
#elif defined(__CLANG_FUJITSU)
# define COMPILER_ID "FujitsuClang"
# define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
# define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
# define COMPILER_VERSION_INTERNAL_STR __clang_version__
#elif defined(__FUJITSU)
# define COMPILER_ID "Fujitsu"
# if defined(__FCC_version__)
# define COMPILER_VERSION __FCC_version__
# elif defined(__FCC_major__)
# define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
# define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
# endif
# if defined(__fcc_version)
# define COMPILER_VERSION_INTERNAL DEC(__fcc_version)
# elif defined(__FCC_VERSION)
# define COMPILER_VERSION_INTERNAL DEC(__FCC_VERSION)
# endif
#elif defined(__ghs__)
# define COMPILER_ID "GHS"
/* __GHS_VERSION_NUMBER = VVVVRP */
# ifdef __GHS_VERSION_NUMBER
# define COMPILER_VERSION_MAJOR DEC(__GHS_VERSION_NUMBER / 100)
# define COMPILER_VERSION_MINOR DEC(__GHS_VERSION_NUMBER / 10 % 10)
# define COMPILER_VERSION_PATCH DEC(__GHS_VERSION_NUMBER % 10)
# endif
#elif defined(__TASKING__)
# define COMPILER_ID "Tasking"
# define COMPILER_VERSION_MAJOR DEC(__VERSION__/1000)
# define COMPILER_VERSION_MINOR DEC(__VERSION__ % 100)
# define COMPILER_VERSION_INTERNAL DEC(__VERSION__)
#elif defined(__ORANGEC__)
# define COMPILER_ID "OrangeC"
# define COMPILER_VERSION_MAJOR DEC(__ORANGEC_MAJOR__)
# define COMPILER_VERSION_MINOR DEC(__ORANGEC_MINOR__)
# define COMPILER_VERSION_PATCH DEC(__ORANGEC_PATCHLEVEL__)
#elif defined(__RENESAS__)
# define COMPILER_ID "Renesas"
/* __RENESAS_VERSION__ = 0xVVRRPP00 */
# define COMPILER_VERSION_MAJOR HEX(__RENESAS_VERSION__ >> 24 & 0xFF)
# define COMPILER_VERSION_MINOR HEX(__RENESAS_VERSION__ >> 16 & 0xFF)
# define COMPILER_VERSION_PATCH HEX(__RENESAS_VERSION__ >> 8 & 0xFF)
#elif defined(__TINYC__)
# define COMPILER_ID "TinyCC"
#elif defined(__BCC__)
# define COMPILER_ID "Bruce"
#elif defined(__SCO_VERSION__)
# define COMPILER_ID "SCO"
#elif defined(__ARMCC_VERSION) && !defined(__clang__)
# define COMPILER_ID "ARMCC"
#if __ARMCC_VERSION >= 1000000
/* __ARMCC_VERSION = VRRPPPP */
# define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/1000000)
# define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 100)
# define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION % 10000)
#else
/* __ARMCC_VERSION = VRPPPP */
# define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/100000)
# define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 10)
# define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION % 10000)
#endif
#elif defined(__clang__) && defined(__apple_build_version__)
# define COMPILER_ID "AppleClang"
# if defined(_MSC_VER)
# define SIMULATE_ID "MSVC"
# endif
# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
# if defined(_MSC_VER)
/* _MSC_VER = VVRR */
# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
# endif
# define COMPILER_VERSION_TWEAK DEC(__apple_build_version__)
#elif defined(__clang__) && defined(__ARMCOMPILER_VERSION)
# define COMPILER_ID "ARMClang"
# define COMPILER_VERSION_MAJOR DEC(__ARMCOMPILER_VERSION/1000000)
# define COMPILER_VERSION_MINOR DEC(__ARMCOMPILER_VERSION/10000 % 100)
# define COMPILER_VERSION_PATCH DEC(__ARMCOMPILER_VERSION/100 % 100)
# define COMPILER_VERSION_INTERNAL DEC(__ARMCOMPILER_VERSION)
#elif defined(__clang__) && defined(__ti__)
# define COMPILER_ID "TIClang"
# define COMPILER_VERSION_MAJOR DEC(__ti_major__)
# define COMPILER_VERSION_MINOR DEC(__ti_minor__)
# define COMPILER_VERSION_PATCH DEC(__ti_patchlevel__)
# define COMPILER_VERSION_INTERNAL DEC(__ti_version__)
#elif defined(__clang__)
# define COMPILER_ID "Clang"
# if defined(_MSC_VER)
# define SIMULATE_ID "MSVC"
# endif
# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
# if defined(_MSC_VER)
/* _MSC_VER = VVRR */
# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
# endif
#elif defined(__LCC__) && (defined(__GNUC__) || defined(__GNUG__) || defined(__MCST__))
# define COMPILER_ID "LCC"
# define COMPILER_VERSION_MAJOR DEC(__LCC__ / 100)
# define COMPILER_VERSION_MINOR DEC(__LCC__ % 100)
# if defined(__LCC_MINOR__)
# define COMPILER_VERSION_PATCH DEC(__LCC_MINOR__)
# endif
# if defined(__GNUC__) && defined(__GNUC_MINOR__)
# define SIMULATE_ID "GNU"
# define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
# if defined(__GNUC_PATCHLEVEL__)
# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
# endif
# endif
#elif defined(__GNUC__)
# define COMPILER_ID "GNU"
# define COMPILER_VERSION_MAJOR DEC(__GNUC__)
# if defined(__GNUC_MINOR__)
# define COMPILER_VERSION_MINOR DEC(__GNUC_MINOR__)
# endif
# if defined(__GNUC_PATCHLEVEL__)
# define COMPILER_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
# endif
#elif defined(_MSC_VER)
# define COMPILER_ID "MSVC"
/* _MSC_VER = VVRR */
# define COMPILER_VERSION_MAJOR DEC(_MSC_VER / 100)
# define COMPILER_VERSION_MINOR DEC(_MSC_VER % 100)
# if defined(_MSC_FULL_VER)
# if _MSC_VER >= 1400
/* _MSC_FULL_VER = VVRRPPPPP */
# define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 100000)
# else
/* _MSC_FULL_VER = VVRRPPPP */
# define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 10000)
# endif
# endif
# if defined(_MSC_BUILD)
# define COMPILER_VERSION_TWEAK DEC(_MSC_BUILD)
# endif
#elif defined(_ADI_COMPILER)
# define COMPILER_ID "ADSP"
#if defined(__VERSIONNUM__)
/* __VERSIONNUM__ = 0xVVRRPPTT */
# define COMPILER_VERSION_MAJOR DEC(__VERSIONNUM__ >> 24 & 0xFF)
# define COMPILER_VERSION_MINOR DEC(__VERSIONNUM__ >> 16 & 0xFF)
# define COMPILER_VERSION_PATCH DEC(__VERSIONNUM__ >> 8 & 0xFF)
# define COMPILER_VERSION_TWEAK DEC(__VERSIONNUM__ & 0xFF)
#endif
#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
# define COMPILER_ID "IAR"
# if defined(__VER__) && defined(__ICCARM__)
# define COMPILER_VERSION_MAJOR DEC((__VER__) / 1000000)
# define COMPILER_VERSION_MINOR DEC(((__VER__) / 1000) % 1000)
# define COMPILER_VERSION_PATCH DEC((__VER__) % 1000)
# define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
# elif defined(__VER__) && (defined(__ICCAVR__) || defined(__ICCRX__) || defined(__ICCRH850__) || defined(__ICCRL78__) || defined(__ICC430__) || defined(__ICCRISCV__) || defined(__ICCV850__) || defined(__ICC8051__) || defined(__ICCSTM8__))
# define COMPILER_VERSION_MAJOR DEC((__VER__) / 100)
# define COMPILER_VERSION_MINOR DEC((__VER__) - (((__VER__) / 100)*100))
# define COMPILER_VERSION_PATCH DEC(__SUBVERSION__)
# define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
# endif
#elif defined(__DCC__) && defined(_DIAB_TOOL)
# define COMPILER_ID "Diab"
# define COMPILER_VERSION_MAJOR DEC(__VERSION_MAJOR_NUMBER__)
# define COMPILER_VERSION_MINOR DEC(__VERSION_MINOR_NUMBER__)
# define COMPILER_VERSION_PATCH DEC(__VERSION_ARCH_FEATURE_NUMBER__)
# define COMPILER_VERSION_TWEAK DEC(__VERSION_BUG_FIX_NUMBER__)
#elif defined(__SDCC_VERSION_MAJOR) || defined(SDCC)
# define COMPILER_ID "SDCC"
# if defined(__SDCC_VERSION_MAJOR)
# define COMPILER_VERSION_MAJOR DEC(__SDCC_VERSION_MAJOR)
# define COMPILER_VERSION_MINOR DEC(__SDCC_VERSION_MINOR)
# define COMPILER_VERSION_PATCH DEC(__SDCC_VERSION_PATCH)
# else
/* SDCC = VRP */
# define COMPILER_VERSION_MAJOR DEC(SDCC/100)
# define COMPILER_VERSION_MINOR DEC(SDCC/10 % 10)
# define COMPILER_VERSION_PATCH DEC(SDCC % 10)
# endif
/* These compilers are either not known or too old to define an
identification macro. Try to identify the platform and guess that
it is the native compiler. */
#elif defined(__hpux) || defined(__hpua)
# define COMPILER_ID "HP"
#else /* unknown compiler */
# define COMPILER_ID ""
#endif
/* Construct the string literal in pieces to prevent the source from
getting matched. Store it in a pointer rather than an array
because some compilers will just produce instructions to fill the
array rather than assigning a pointer to a static array. */
char const* info_compiler = "INFO" ":" "compiler[" COMPILER_ID "]";
#ifdef SIMULATE_ID
char const* info_simulate = "INFO" ":" "simulate[" SIMULATE_ID "]";
#endif
#ifdef __QNXNTO__
char const* qnxnto = "INFO" ":" "qnxnto[]";
#endif
#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
char const *info_cray = "INFO" ":" "compiler_wrapper[CrayPrgEnv]";
#endif
#define STRINGIFY_HELPER(X) #X
#define STRINGIFY(X) STRINGIFY_HELPER(X)
/* Identify known platforms by name. */
#if defined(__linux) || defined(__linux__) || defined(linux)
# define PLATFORM_ID "Linux"
#elif defined(__MSYS__)
# define PLATFORM_ID "MSYS"
#elif defined(__CYGWIN__)
# define PLATFORM_ID "Cygwin"
#elif defined(__MINGW32__)
# define PLATFORM_ID "MinGW"
#elif defined(__APPLE__)
# define PLATFORM_ID "Darwin"
#elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
# define PLATFORM_ID "Windows"
#elif defined(__FreeBSD__) || defined(__FreeBSD)
# define PLATFORM_ID "FreeBSD"
#elif defined(__NetBSD__) || defined(__NetBSD)
# define PLATFORM_ID "NetBSD"
#elif defined(__OpenBSD__) || defined(__OPENBSD)
# define PLATFORM_ID "OpenBSD"
#elif defined(__sun) || defined(sun)
# define PLATFORM_ID "SunOS"
#elif defined(_AIX) || defined(__AIX) || defined(__AIX__) || defined(__aix) || defined(__aix__)
# define PLATFORM_ID "AIX"
#elif defined(__hpux) || defined(__hpux__)
# define PLATFORM_ID "HP-UX"
#elif defined(__HAIKU__)
# define PLATFORM_ID "Haiku"
#elif defined(__BeOS) || defined(__BEOS__) || defined(_BEOS)
# define PLATFORM_ID "BeOS"
#elif defined(__QNX__) || defined(__QNXNTO__)
# define PLATFORM_ID "QNX"
#elif defined(__tru64) || defined(_tru64) || defined(__TRU64__)
# define PLATFORM_ID "Tru64"
#elif defined(__riscos) || defined(__riscos__)
# define PLATFORM_ID "RISCos"
#elif defined(__sinix) || defined(__sinix__) || defined(__SINIX__)
# define PLATFORM_ID "SINIX"
#elif defined(__UNIX_SV__)
# define PLATFORM_ID "UNIX_SV"
#elif defined(__bsdos__)
# define PLATFORM_ID "BSDOS"
#elif defined(_MPRAS) || defined(MPRAS)
# define PLATFORM_ID "MP-RAS"
#elif defined(__osf) || defined(__osf__)
# define PLATFORM_ID "OSF1"
#elif defined(_SCO_SV) || defined(SCO_SV) || defined(sco_sv)
# define PLATFORM_ID "SCO_SV"
#elif defined(__ultrix) || defined(__ultrix__) || defined(_ULTRIX)
# define PLATFORM_ID "ULTRIX"
#elif defined(__XENIX__) || defined(_XENIX) || defined(XENIX)
# define PLATFORM_ID "Xenix"
#elif defined(__WATCOMC__)
# if defined(__LINUX__)
# define PLATFORM_ID "Linux"
# elif defined(__DOS__)
# define PLATFORM_ID "DOS"
# elif defined(__OS2__)
# define PLATFORM_ID "OS2"
# elif defined(__WINDOWS__)
# define PLATFORM_ID "Windows3x"
# elif defined(__VXWORKS__)
# define PLATFORM_ID "VxWorks"
# else /* unknown platform */
# define PLATFORM_ID
# endif
#elif defined(__INTEGRITY)
# if defined(INT_178B)
# define PLATFORM_ID "Integrity178"
# else /* regular Integrity */
# define PLATFORM_ID "Integrity"
# endif
# elif defined(_ADI_COMPILER)
# define PLATFORM_ID "ADSP"
#else /* unknown platform */
# define PLATFORM_ID
#endif
/* For windows compilers MSVC and Intel we can determine
the architecture of the compiler being used. This is because
the compilers do not have flags that can change the architecture,
but rather depend on which compiler is being used
*/
/* ARCHITECTURE_ID can be derived from predefined macros only for toolchains
   whose target architecture is fixed by the compiler binary itself (MSVC,
   Watcom, IAR, Green Hills, TI, TASKING, Renesas).  Each branch below maps
   one vendor's target macros to a short architecture string; unknown
   architectures yield "" and an unmatched compiler leaves the macro empty. */
#if defined(_WIN32) && defined(_MSC_VER)
# if defined(_M_IA64)
# define ARCHITECTURE_ID "IA64"
# elif defined(_M_ARM64EC)
# define ARCHITECTURE_ID "ARM64EC"
# elif defined(_M_X64) || defined(_M_AMD64)
# define ARCHITECTURE_ID "x64"
# elif defined(_M_IX86)
# define ARCHITECTURE_ID "X86"
# elif defined(_M_ARM64)
# define ARCHITECTURE_ID "ARM64"
# elif defined(_M_ARM)
# if _M_ARM == 4
# define ARCHITECTURE_ID "ARMV4I"
# elif _M_ARM == 5
# define ARCHITECTURE_ID "ARMV5I"
# else
  /* _M_ARM carries the ARM version number; stringify it for other values.
     STRINGIFY is expected to be defined earlier in this file — TODO confirm. */
# define ARCHITECTURE_ID "ARMV" STRINGIFY(_M_ARM)
# endif
# elif defined(_M_MIPS)
# define ARCHITECTURE_ID "MIPS"
# elif defined(_M_SH)
# define ARCHITECTURE_ID "SHx"
# else /* unknown architecture */
# define ARCHITECTURE_ID ""
# endif
#elif defined(__WATCOMC__)
# if defined(_M_I86)
# define ARCHITECTURE_ID "I86"
# elif defined(_M_IX86)
# define ARCHITECTURE_ID "X86"
# else /* unknown architecture */
# define ARCHITECTURE_ID ""
# endif
#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
# if defined(__ICCARM__)
# define ARCHITECTURE_ID "ARM"
# elif defined(__ICCRX__)
# define ARCHITECTURE_ID "RX"
# elif defined(__ICCRH850__)
# define ARCHITECTURE_ID "RH850"
# elif defined(__ICCRL78__)
# define ARCHITECTURE_ID "RL78"
# elif defined(__ICCRISCV__)
# define ARCHITECTURE_ID "RISCV"
# elif defined(__ICCAVR__)
# define ARCHITECTURE_ID "AVR"
# elif defined(__ICC430__)
# define ARCHITECTURE_ID "MSP430"
# elif defined(__ICCV850__)
# define ARCHITECTURE_ID "V850"
# elif defined(__ICC8051__)
# define ARCHITECTURE_ID "8051"
# elif defined(__ICCSTM8__)
# define ARCHITECTURE_ID "STM8"
# else /* unknown architecture */
# define ARCHITECTURE_ID ""
# endif
#elif defined(__ghs__)
# if defined(__PPC64__)
# define ARCHITECTURE_ID "PPC64"
# elif defined(__ppc__)
# define ARCHITECTURE_ID "PPC"
# elif defined(__ARM__)
# define ARCHITECTURE_ID "ARM"
# elif defined(__x86_64__)
# define ARCHITECTURE_ID "x64"
# elif defined(__i386__)
# define ARCHITECTURE_ID "X86"
# else /* unknown architecture */
# define ARCHITECTURE_ID ""
# endif
#elif defined(__clang__) && defined(__ti__)
# if defined(__ARM_ARCH)
# define ARCHITECTURE_ID "ARM"
# else /* unknown architecture */
# define ARCHITECTURE_ID ""
# endif
#elif defined(__TI_COMPILER_VERSION__)
# if defined(__TI_ARM__)
# define ARCHITECTURE_ID "ARM"
# elif defined(__MSP430__)
# define ARCHITECTURE_ID "MSP430"
# elif defined(__TMS320C28XX__)
# define ARCHITECTURE_ID "TMS320C28x"
# elif defined(__TMS320C6X__) || defined(_TMS320C6X)
# define ARCHITECTURE_ID "TMS320C6x"
# else /* unknown architecture */
# define ARCHITECTURE_ID ""
# endif
/* The ADI SHARC/Blackfin cases below continue the OUTER #elif chain (note
   the indentation), not the TI-internal one closed by the # endif above. */
# elif defined(__ADSPSHARC__)
# define ARCHITECTURE_ID "SHARC"
# elif defined(__ADSPBLACKFIN__)
# define ARCHITECTURE_ID "Blackfin"
#elif defined(__TASKING__)
# if defined(__CTC__) || defined(__CPTC__)
# define ARCHITECTURE_ID "TriCore"
# elif defined(__CMCS__)
# define ARCHITECTURE_ID "MCS"
# elif defined(__CARM__) || defined(__CPARM__)
# define ARCHITECTURE_ID "ARM"
# elif defined(__CARC__)
# define ARCHITECTURE_ID "ARC"
# elif defined(__C51__)
# define ARCHITECTURE_ID "8051"
# elif defined(__CPCP__)
# define ARCHITECTURE_ID "PCP"
# else
# define ARCHITECTURE_ID ""
# endif
#elif defined(__RENESAS__)
# if defined(__CCRX__)
# define ARCHITECTURE_ID "RX"
# elif defined(__CCRL__)
# define ARCHITECTURE_ID "RL78"
# elif defined(__CCRH__)
# define ARCHITECTURE_ID "RH850"
# else
# define ARCHITECTURE_ID ""
# endif
#else
/* Architecture not determinable from the compiler: leave the ID empty
   (expands to nothing, unlike the "" cases above). */
# define ARCHITECTURE_ID
#endif
/* Convert integer to decimal digit literals.
   Expands to eight comma-separated character constants: the decimal digits
   of (n), zero-padded to a fixed width of 8 (i.e. values < 10^8). */
#define DEC(n) \
('0' + (((n) / 10000000)%10)), \
('0' + (((n) / 1000000)%10)), \
('0' + (((n) / 100000)%10)), \
('0' + (((n) / 10000)%10)), \
('0' + (((n) / 1000)%10)), \
('0' + (((n) / 100)%10)), \
('0' + (((n) / 10)%10)), \
('0' + ((n) % 10))
/* Convert integer to hex digit literals.
   Expands to eight comma-separated character constants covering a full
   32-bit value, one nibble each.  Nibbles above 9 produce the characters
   ':' .. '?' (i.e. '0' + nibble), not 'A'..'F'; the consumer of the string
   must decode accordingly. */
#define HEX(n) \
('0' + ((n)>>28 & 0xF)), \
('0' + ((n)>>24 & 0xF)), \
('0' + ((n)>>20 & 0xF)), \
('0' + ((n)>>16 & 0xF)), \
('0' + ((n)>>12 & 0xF)), \
('0' + ((n)>>8 & 0xF)), \
('0' + ((n)>>4 & 0xF)), \
('0' + ((n) & 0xF))
/* Construct a string literal encoding the version number. */
#ifdef COMPILER_VERSION
/* COMPILER_VERSION is already a complete version string: concatenate it
   directly into the "INFO:compiler_version[...]" marker. */
char const* info_version = "INFO" ":" "compiler_version[" COMPILER_VERSION "]";
/* Construct a string literal encoding the version number components. */
#elif defined(COMPILER_VERSION_MAJOR)
/* Otherwise the version is assembled char-by-char.  Each *_MAJOR/MINOR/
   PATCH/TWEAK macro is expected to expand to a comma-separated list of
   character literals (see the DEC/HEX macros above) — TODO confirm against
   the definitions injected by the build. */
char const info_version[] = {
'I', 'N', 'F', 'O', ':',
'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','[',
COMPILER_VERSION_MAJOR,
# ifdef COMPILER_VERSION_MINOR
'.', COMPILER_VERSION_MINOR,
# ifdef COMPILER_VERSION_PATCH
'.', COMPILER_VERSION_PATCH,
# ifdef COMPILER_VERSION_TWEAK
'.', COMPILER_VERSION_TWEAK,
# endif
# endif
# endif
']','\0'};
#endif
/* Construct a string literal encoding the internal version number. */
#ifdef COMPILER_VERSION_INTERNAL
/* Char-list form: COMPILER_VERSION_INTERNAL must expand to a comma-separated
   list of character literals (as produced by DEC/HEX above). */
char const info_version_internal[] = {
'I', 'N', 'F', 'O', ':',
'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','_',
'i','n','t','e','r','n','a','l','[',
COMPILER_VERSION_INTERNAL,']','\0'};
#elif defined(COMPILER_VERSION_INTERNAL_STR)
/* String form: the macro is a ready-made string literal. */
char const* info_version_internal = "INFO" ":" "compiler_version_internal[" COMPILER_VERSION_INTERNAL_STR "]";
#endif
/* Construct a string literal encoding the version number components. */
#ifdef SIMULATE_VERSION_MAJOR
/* Version of the compiler being simulated (e.g. a frontend emulating
   another vendor's compiler).  Built char-by-char exactly like
   info_version; each component macro must expand to character literals. */
char const info_simulate_version[] = {
'I', 'N', 'F', 'O', ':',
's','i','m','u','l','a','t','e','_','v','e','r','s','i','o','n','[',
SIMULATE_VERSION_MAJOR,
# ifdef SIMULATE_VERSION_MINOR
'.', SIMULATE_VERSION_MINOR,
# ifdef SIMULATE_VERSION_PATCH
'.', SIMULATE_VERSION_PATCH,
# ifdef SIMULATE_VERSION_TWEAK
'.', SIMULATE_VERSION_TWEAK,
# endif
# endif
# endif
']','\0'};
#endif
/* Construct the string literal in pieces to prevent the source from
   getting matched. Store it in a pointer rather than an array
   because some compilers will just produce instructions to fill the
   array rather than assigning a pointer to a static array. */
char const* info_platform = "INFO" ":" "platform[" PLATFORM_ID "]";
char const* info_arch = "INFO" ":" "arch[" ARCHITECTURE_ID "]";
/* __STDC_VERSION__ values that delimit the published C standards. */
#define C_STD_99 199901L
#define C_STD_11 201112L
#define C_STD_17 201710L
#define C_STD_23 202311L
#ifdef __STDC_VERSION__
# define C_STD __STDC_VERSION__
#endif
/* Derive the default C standard level as a string.  Note: when C_STD is
   not defined (no __STDC_VERSION__), the #elif comparisons below evaluate
   it as 0 (undefined identifiers are 0 in #if expressions), so the chain
   falls through to "90". */
#if !defined(__STDC__) && !defined(__clang__) && !defined(__RENESAS__)
# if defined(_MSC_VER) || defined(__ibmxl__) || defined(__IBMC__)
# define C_VERSION "90"
# else
# define C_VERSION
# endif
#elif C_STD > C_STD_17
# define C_VERSION "23"
#elif C_STD > C_STD_11
# define C_VERSION "17"
#elif C_STD > C_STD_99
# define C_VERSION "11"
#elif C_STD >= C_STD_99
# define C_VERSION "99"
#else
# define C_VERSION "90"
#endif
const char* info_language_standard_default =
"INFO" ":" "standard_default[" C_VERSION "]";
/* "ON" when the compiler family enables language extensions by default
   and strict-ANSI mode is not in effect; "OFF" otherwise. */
const char* info_language_extensions_default = "INFO" ":" "extensions_default["
#if (defined(__clang__) || defined(__GNUC__) || defined(__xlC__) || \
defined(__TI_COMPILER_VERSION__) || defined(__RENESAS__)) && \
!defined(__STRICT_ANSI__)
"ON"
#else
"OFF"
#endif
"]";
/*--------------------------------------------------------------------------*/
#ifdef ID_VOID_MAIN
/* Some freestanding targets require a void main with an empty body. */
void main() {}
#else
# if defined(__CLASSIC_C__)
/* Pre-ANSI (K&R) definition for compilers without prototypes. */
int main(argc, argv) int argc; char *argv[];
# else
int main(int argc, char* argv[])
# endif
{
  /* Sum one byte from each INFO string, indexed by the runtime value argc,
     so the compiler cannot prove the strings unused and discard them; the
     strings presumably stay in the binary to be scanned for the INFO
     markers rather than executed — NOTE(review): confirm against the
     consumer of this file. */
int require = 0;
require += info_compiler[argc];
require += info_platform[argc];
require += info_arch[argc];
#ifdef COMPILER_VERSION_MAJOR
require += info_version[argc];
#endif
#if defined(COMPILER_VERSION_INTERNAL) || defined(COMPILER_VERSION_INTERNAL_STR)
require += info_version_internal[argc];
#endif
#ifdef SIMULATE_ID
require += info_simulate[argc];
#endif
#ifdef SIMULATE_VERSION_MAJOR
require += info_simulate_version[argc];
#endif
#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
require += info_cray[argc];
#endif
require += info_language_standard_default[argc];
require += info_language_extensions_default[argc];
(void)argv;
return require;
}
#endif

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1 @@
# This file is generated by cmake for dependency checking of the CMakeCache.txt file

View File

@ -1,28 +1,189 @@
/*
* This file comes from the Kyber repo; see the files in kyber/avx2/test or kyber/ref/test for further details.
* pqc-bench harness cycle-count + optional PAPI hardware counter benchmarks.
*
* Usage: <binary> [nspins]
* nspins number of outer loop-spin iterations (default: 1)
*
* Each spin runs all operations with NTESTS inner iterations and prints one
* median/average pair per operation. With WITH_PAPI, additional lines are
* emitted per hardware counter using the same parseable format.
*
* Build flags:
* -DWITH_PAPI link against PAPI and emit hardware counter lines
*/
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include "../kem.h"
#include "../params.h"
#include "../indcpa.h"
#include "../polyvec.h"
#include "../poly.h"
#include "../randombytes.h"
#include <string.h>
#include "kem.h"
#include "params.h"
#include "indcpa.h"
#include "polyvec.h"
#include "poly.h"
#include "randombytes.h"
#include "cpucycles.h"
#include "speed_print.h"
#ifdef WITH_PAPI
#include <papi.h>
#endif
#define NTESTS 1000
uint64_t t[NTESTS];
uint8_t seed[KYBER_SYMBYTES] = {0};
/* ── PAPI instrumentation ───────────────────────────────────────────────── */
#ifdef WITH_PAPI
int main(void)
typedef struct {
int code;
const char *name;
} papi_event_def;
static const papi_event_def DESIRED_EVENTS[] = {
{ PAPI_TOT_INS, "instructions" },
{ PAPI_L1_DCM, "l1_misses" },
{ PAPI_L2_TCM, "l2_misses" },
{ PAPI_L3_TCM, "l3_misses" },
{ PAPI_BR_MSP, "branch_mispreds" },
};
#define MAX_EVENTS ((int)(sizeof(DESIRED_EVENTS) / sizeof(DESIRED_EVENTS[0])))
static int papi_eventset = PAPI_NULL;
static int active_codes[MAX_EVENTS];
static const char *active_names[MAX_EVENTS];
static int n_active = 0;
static int papi_ok = 0; /* set to 1 if init succeeded */
static void papi_init(void) {
int ret;
ret = PAPI_library_init(PAPI_VER_CURRENT);
if (ret != PAPI_VER_CURRENT) {
fprintf(stderr, "PAPI_library_init: %s — hardware counters disabled\n",
PAPI_strerror(ret));
return;
}
if ((ret = PAPI_create_eventset(&papi_eventset)) != PAPI_OK) {
fprintf(stderr, "PAPI_create_eventset: %s — hardware counters disabled\n",
PAPI_strerror(ret));
return;
}
for (int i = 0; i < MAX_EVENTS; i++) {
if (PAPI_query_event(DESIRED_EVENTS[i].code) != PAPI_OK) {
fprintf(stderr, "PAPI: event %s not available on this hardware, skipping\n",
DESIRED_EVENTS[i].name);
continue;
}
ret = PAPI_add_event(papi_eventset, DESIRED_EVENTS[i].code);
if (ret != PAPI_OK) {
fprintf(stderr, "PAPI_add_event(%s): %s — skipping\n",
DESIRED_EVENTS[i].name, PAPI_strerror(ret));
continue;
}
active_codes[n_active] = DESIRED_EVENTS[i].code;
active_names[n_active] = DESIRED_EVENTS[i].name;
n_active++;
}
if (n_active == 0) {
fprintf(stderr, "PAPI: no events could be added — hardware counters disabled\n");
return;
}
if ((ret = PAPI_start(papi_eventset)) != PAPI_OK) {
fprintf(stderr, "PAPI_start: %s — hardware counters disabled\n",
PAPI_strerror(ret));
return;
}
papi_ok = 1;
}
/*
* papi_print print per-call counter values for one (op, counter) pair.
* Both "median" and "average" are set to the same per-call value; the outer
* loop-spin structure gives the aggregation tool a real distribution.
* The IPC line uses a float value multiplied by 1000 for integer storage;
* the analysis tool divides by 1000 to recover IPC.
*/
static void papi_print(const char *op, const char *counter,
long long total, int ntests)
{
unsigned int i;
long long per_call = total / ntests;
printf("%s_%s: \nmedian: %lld per_call\naverage: %lld per_call\n\n",
op, counter, per_call, per_call);
}
/*
* papi_bench read counters around an already-executed NTESTS block.
* Call papi_read_before() immediately before the loop and
* papi_bench_report() immediately after.
*/
static long long _papi_before[MAX_EVENTS];
static long long _papi_after[MAX_EVENTS];
static inline void papi_read_before(void) {
if (papi_ok) PAPI_read(papi_eventset, _papi_before);
}
static void papi_bench_report(const char *op) {
if (!papi_ok) return;
PAPI_read(papi_eventset, _papi_after);
for (int e = 0; e < n_active; e++) {
long long delta = _papi_after[e] - _papi_before[e];
papi_print(op, active_names[e], delta, NTESTS);
}
}
#define PAPI_BEFORE() papi_read_before()
#define PAPI_AFTER(op) papi_bench_report(op)
#else /* !WITH_PAPI */
static inline void papi_init(void) {}
#define PAPI_BEFORE() ((void)0)
#define PAPI_AFTER(op) ((void)0)
#endif /* WITH_PAPI */
/* ── Benchmark helpers ───────────────────────────────────────────────────── */
/*
* BENCH(label, body) time NTESTS executions of body, print results, then
* emit PAPI counter lines if enabled.
*/
#define BENCH(label, body) \
do { \
PAPI_BEFORE(); \
for (unsigned int _i = 0; _i < NTESTS; _i++) { \
t[_i] = cpucycles(); \
body; \
} \
print_results(label ": ", t, NTESTS); \
PAPI_AFTER(label); \
} while (0)
/* ── Main ────────────────────────────────────────────────────────────────── */
static uint64_t t[NTESTS];
static uint8_t seed[KYBER_SYMBYTES] = {0};
int main(int argc, char *argv[])
{
int nspins = 1;
if (argc > 1) {
nspins = atoi(argv[1]);
if (nspins <= 0) {
fprintf(stderr, "usage: %s [nspins]\n", argv[0]);
return 1;
}
}
papi_init();
uint8_t pk[CRYPTO_PUBLICKEYBYTES];
uint8_t sk[CRYPTO_SECRETKEYBYTES];
uint8_t ct[CRYPTO_CIPHERTEXTBYTES];
@ -30,130 +191,74 @@ int main(void)
uint8_t coins32[KYBER_SYMBYTES];
uint8_t coins64[2*KYBER_SYMBYTES];
polyvec matrix[KYBER_K];
poly ap;
poly ap;
randombytes(coins32, KYBER_SYMBYTES);
randombytes(coins64, 2*KYBER_SYMBYTES);
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
gen_matrix(matrix, seed, 0);
}
print_results("gen_a: ", t, NTESTS);
for (int spin = 1; spin <= nspins; spin++) {
printf("Loop spin: %d\n", spin);
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
poly_getnoise_eta1(&ap, seed, 0);
}
print_results("poly_getnoise_eta1: ", t, NTESTS);
BENCH("gen_a",
gen_matrix(matrix, seed, 0));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
poly_getnoise_eta2(&ap, seed, 0);
}
print_results("poly_getnoise_eta2: ", t, NTESTS);
BENCH("poly_getnoise_eta1",
poly_getnoise_eta1(&ap, seed, 0));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
poly_ntt(&ap);
}
print_results("NTT: ", t, NTESTS);
BENCH("poly_getnoise_eta2",
poly_getnoise_eta2(&ap, seed, 0));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
poly_invntt_tomont(&ap);
}
print_results("INVNTT: ", t, NTESTS);
BENCH("NTT",
poly_ntt(&ap));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
polyvec_basemul_acc_montgomery(&ap, &matrix[0], &matrix[1]);
}
print_results("polyvec_basemul_acc_montgomery: ", t, NTESTS);
BENCH("INVNTT",
poly_invntt_tomont(&ap));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
poly_tomsg(ct,&ap);
}
print_results("poly_tomsg: ", t, NTESTS);
BENCH("polyvec_basemul_acc_montgomery",
polyvec_basemul_acc_montgomery(&ap, &matrix[0], &matrix[1]));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
poly_frommsg(&ap,ct);
}
print_results("poly_frommsg: ", t, NTESTS);
BENCH("poly_tomsg",
poly_tomsg(ct, &ap));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
poly_compress(ct,&ap);
}
print_results("poly_compress: ", t, NTESTS);
BENCH("poly_frommsg",
poly_frommsg(&ap, ct));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
poly_decompress(&ap,ct);
}
print_results("poly_decompress: ", t, NTESTS);
BENCH("poly_compress",
poly_compress(ct, &ap));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
polyvec_compress(ct,&matrix[0]);
}
print_results("polyvec_compress: ", t, NTESTS);
BENCH("poly_decompress",
poly_decompress(&ap, ct));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
polyvec_decompress(&matrix[0],ct);
}
print_results("polyvec_decompress: ", t, NTESTS);
BENCH("polyvec_compress",
polyvec_compress(ct, &matrix[0]));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
indcpa_keypair_derand(pk, sk, coins32);
}
print_results("indcpa_keypair: ", t, NTESTS);
BENCH("polyvec_decompress",
polyvec_decompress(&matrix[0], ct));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
indcpa_enc(ct, key, pk, seed);
}
print_results("indcpa_enc: ", t, NTESTS);
BENCH("indcpa_keypair",
indcpa_keypair_derand(pk, sk, coins32));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
indcpa_dec(key, ct, sk);
}
print_results("indcpa_dec: ", t, NTESTS);
BENCH("indcpa_enc",
indcpa_enc(ct, key, pk, seed));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
crypto_kem_keypair_derand(pk, sk, coins64);
}
print_results("kyber_keypair_derand: ", t, NTESTS);
BENCH("indcpa_dec",
indcpa_dec(key, ct, sk));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
crypto_kem_keypair(pk, sk);
}
print_results("kyber_keypair: ", t, NTESTS);
BENCH("kyber_keypair_derand",
crypto_kem_keypair_derand(pk, sk, coins64));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
crypto_kem_enc_derand(ct, key, pk, coins32);
}
print_results("kyber_encaps_derand: ", t, NTESTS);
BENCH("kyber_keypair",
crypto_kem_keypair(pk, sk));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
crypto_kem_enc(ct, key, pk);
}
print_results("kyber_encaps: ", t, NTESTS);
BENCH("kyber_encaps_derand",
crypto_kem_enc_derand(ct, key, pk, coins32));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
crypto_kem_dec(key, ct, sk);
BENCH("kyber_encaps",
crypto_kem_enc(ct, key, pk));
BENCH("kyber_decaps",
crypto_kem_dec(key, ct, sk));
}
print_results("kyber_decaps: ", t, NTESTS);
return 0;
}

Binary file not shown.

View File

@ -0,0 +1,10 @@
op,m512,m768,m1024
INVNTT,1.000,1.000,1.000
basemul,1.000,1.000,1.000
frommsg,1.000,1.000,1.000
NTT,1.000,1.000,1.000
iDec,1.000,1.000,1.000
iEnc,1.000,1.000,1.000
iKeypair,1.000,1.000,1.000
gena,1.000,1.000,1.000
noise,1.000,1.000,0.999
1 op m512 m768 m1024
2 INVNTT 1.000 1.000 1.000
3 basemul 1.000 1.000 1.000
4 frommsg 1.000 1.000 1.000
5 NTT 1.000 1.000 1.000
6 iDec 1.000 1.000 1.000
7 iEnc 1.000 1.000 1.000
8 iKeypair 1.000 1.000 1.000
9 gena 1.000 1.000 1.000
10 noise 1.000 1.000 0.999

View File

@ -0,0 +1,5 @@
op,m512_sp,m512_elo,m512_ehi,m768_sp,m768_elo,m768_ehi,m1024_sp,m1024_elo,m1024_ehi
frommsg,45.642857142857146,0.0,0.0,49.15384615384615,0.0,0.0,55.38461538461539,0.0,0.0
INVNTT,56.26086956521739,0.0,0.0,52.22826086956522,0.0,0.010869565217390686,50.49514563106796,0.009708737864080774,0.0
basemul,52.04054054054054,0.0,0.7128841169937061,47.577586206896555,0.0,0.0,41.63333333333333,0.0,0.0
NTT,35.526315789473685,0.010526315789476826,2.395032525133054,39.39080459770115,0.44762277951932816,0.0,34.58585858585859,0.010101010101010388,0.3631210059781438
1 op m512_sp m512_elo m512_ehi m768_sp m768_elo m768_ehi m1024_sp m1024_elo m1024_ehi
2 frommsg 45.642857142857146 0.0 0.0 49.15384615384615 0.0 0.0 55.38461538461539 0.0 0.0
3 INVNTT 56.26086956521739 0.0 0.0 52.22826086956522 0.0 0.010869565217390686 50.49514563106796 0.009708737864080774 0.0
4 basemul 52.04054054054054 0.0 0.7128841169937061 47.577586206896555 0.0 0.0 41.63333333333333 0.0 0.0
5 NTT 35.526315789473685 0.010526315789476826 2.395032525133054 39.39080459770115 0.44762277951932816 0.0 34.58585858585859 0.010101010101010388 0.3631210059781438

View File

@ -0,0 +1,10 @@
op,refnv_sp,refnv_elo,refnv_ehi,ref_sp,ref_elo,ref_ehi,avx2_sp,avx2_elo,avx2_ehi
INVNTT,3.6937872667820737,0.0,0.0001923446816691765,3.6923668525283597,0.0,0.0008062243947173364,186.44660194174756,0.0,0.00970873786408788
basemul,3.209016393442623,6.209637357201814e-05,0.00012419274714359219,3.4479583666933546,0.00013344008540183694,0.00013344008540183694,143.55,0.005555555555559977,0.005555555555531555
frommsg,3.0156494522691704,0.0,0.0,2.676388888888889,0.0,0.0,148.23076923076923,0.0,0.0
NTT,3.691742580076403,0.0010845307227014267,0.0002938583602705158,3.6691004672897196,0.001071270209427766,0.0010718961341775746,126.8989898989899,0.0,1.3050917336631755
iDec,3.5713012771855714,0.00023570612000023416,0.00015086802895014628,3.690161977834612,0.0005032782539924341,0.00046931032063479705,114.75503711558855,0.0010604453870683983,0.0010604453870541874
iEnc,3.084863236932217,0.0001782560024712332,0.00016342197515761825,3.21233254333646,0.00035364887129318845,0.00028601070699840747,30.157900043693072,0.0029733062283590073,0.001753088869445918
iKeypair,3.049990457461021,0.00022319698359352103,0.00019792531427453852,3.207066542768769,0.0006512941219742885,0.0005064778000369863,26.020352541412997,0.0025143592087069067,0.0010972674500919766
gena,2.6965550354099146,0.000484369799391704,0.00048237643023396615,2.7162479142988416,0.0006808616189104555,0.0007206686696927811,12.97504909321936,0.0031123799730270463,0.0032871286177282855
noise,2.977777777777778,0.0,0.0,3.4190382728164868,0.0,0.0033585837650456085,4.070093457943925,0.0,0.0
1 op refnv_sp refnv_elo refnv_ehi ref_sp ref_elo ref_ehi avx2_sp avx2_elo avx2_ehi
2 INVNTT 3.6937872667820737 0.0 0.0001923446816691765 3.6923668525283597 0.0 0.0008062243947173364 186.44660194174756 0.0 0.00970873786408788
3 basemul 3.209016393442623 6.209637357201814e-05 0.00012419274714359219 3.4479583666933546 0.00013344008540183694 0.00013344008540183694 143.55 0.005555555555559977 0.005555555555531555
4 frommsg 3.0156494522691704 0.0 0.0 2.676388888888889 0.0 0.0 148.23076923076923 0.0 0.0
5 NTT 3.691742580076403 0.0010845307227014267 0.0002938583602705158 3.6691004672897196 0.001071270209427766 0.0010718961341775746 126.8989898989899 0.0 1.3050917336631755
6 iDec 3.5713012771855714 0.00023570612000023416 0.00015086802895014628 3.690161977834612 0.0005032782539924341 0.00046931032063479705 114.75503711558855 0.0010604453870683983 0.0010604453870541874
7 iEnc 3.084863236932217 0.0001782560024712332 0.00016342197515761825 3.21233254333646 0.00035364887129318845 0.00028601070699840747 30.157900043693072 0.0029733062283590073 0.001753088869445918
8 iKeypair 3.049990457461021 0.00022319698359352103 0.00019792531427453852 3.207066542768769 0.0006512941219742885 0.0005064778000369863 26.020352541412997 0.0025143592087069067 0.0010972674500919766
9 gena 2.6965550354099146 0.000484369799391704 0.00048237643023396615 2.7162479142988416 0.0006808616189104555 0.0007206686696927811 12.97504909321936 0.0031123799730270463 0.0032871286177282855
10 noise 2.977777777777778 0.0 0.0 3.4190382728164868 0.0 0.0033585837650456085 4.070093457943925 0.0 0.0

View File

@ -0,0 +1,10 @@
op,refnv_sp,refnv_elo,refnv_ehi,ref_sp,ref_elo,ref_ehi,avx2_sp,avx2_elo,avx2_ehi
INVNTT,4.082526315789473,0.0,0.00021052631579010495,3.7465224111282844,0.0,0.00019319938176209916,210.7826086956522,0.0,0.010869565217376476
basemul,3.2770963704630787,0.0016397780187453748,0.0024627477733942804,3.3996364580628406,0.0,0.0,176.9189189189189,0.0,2.4235468345057427
frommsg,3.0109546165884193,0.0,0.0,3.0109546165884193,0.0,0.0,137.42857142857142,0.0,0.0
NTT,3.6866764275256223,0.002157843972798279,0.0010798700725032084,3.7303703703703706,0.0,0.0011056225164107758,132.52631578947367,0.0,8.934358367829702
iDec,3.742600033957779,0.0006353440528448218,0.00042368257587099833,3.79609644087256,0.0002753054612747441,0.0002753370710646408,133.0543259557344,0.0020120724346099905,0.0020120724346099905
iEnc,3.4432478262438213,0.0002504959891131975,0.00030259771432428195,3.530109117810246,0.00039168308874293345,0.00032646898342836295,35.20992436819775,0.0063094659476519155,0.0011068068622037686
iKeypair,3.1751089014071656,9.92090538622925e-05,0.00021725496542801537,3.351041039836322,0.00032261099326946763,0.0003142150864068327,27.8438,0.005767606478706,0.005769913982796027
gena,2.716878579054644,0.00065187098010977,0.0003882364359895085,2.743237945903567,0.0002940023520188184,0.00046488659667787147,12.781735159817352,0.001369863013698236,0.001369863013698236
noise,3.1366495140080044,0.0017923711508616158,0.0,3.433041301627034,0.0,0.0006257822277846437,4.766290182450043,0.0,0.0041446001586527
1 op refnv_sp refnv_elo refnv_ehi ref_sp ref_elo ref_ehi avx2_sp avx2_elo avx2_ehi
2 INVNTT 4.082526315789473 0.0 0.00021052631579010495 3.7465224111282844 0.0 0.00019319938176209916 210.7826086956522 0.0 0.010869565217376476
3 basemul 3.2770963704630787 0.0016397780187453748 0.0024627477733942804 3.3996364580628406 0.0 0.0 176.9189189189189 0.0 2.4235468345057427
4 frommsg 3.0109546165884193 0.0 0.0 3.0109546165884193 0.0 0.0 137.42857142857142 0.0 0.0
5 NTT 3.6866764275256223 0.002157843972798279 0.0010798700725032084 3.7303703703703706 0.0 0.0011056225164107758 132.52631578947367 0.0 8.934358367829702
6 iDec 3.742600033957779 0.0006353440528448218 0.00042368257587099833 3.79609644087256 0.0002753054612747441 0.0002753370710646408 133.0543259557344 0.0020120724346099905 0.0020120724346099905
7 iEnc 3.4432478262438213 0.0002504959891131975 0.00030259771432428195 3.530109117810246 0.00039168308874293345 0.00032646898342836295 35.20992436819775 0.0063094659476519155 0.0011068068622037686
8 iKeypair 3.1751089014071656 9.92090538622925e-05 0.00021725496542801537 3.351041039836322 0.00032261099326946763 0.0003142150864068327 27.8438 0.005767606478706 0.005769913982796027
9 gena 2.716878579054644 0.00065187098010977 0.0003882364359895085 2.743237945903567 0.0002940023520188184 0.00046488659667787147 12.781735159817352 0.001369863013698236 0.001369863013698236
10 noise 3.1366495140080044 0.0017923711508616158 0.0 3.433041301627034 0.0 0.0006257822277846437 4.766290182450043 0.0 0.0041446001586527

View File

@ -0,0 +1,10 @@
op,refnv_sp,refnv_elo,refnv_ehi,ref_sp,ref_elo,ref_ehi,avx2_sp,avx2_elo,avx2_ehi
INVNTT,3.9386252045826513,0.00020458265139122744,0.00020458265139122744,4.006659729448491,0.0008336786786200534,0.00020811654526564638,209.2608695652174,0.010869565217404897,0.010869565217376476
basemul,3.306184521797905,0.02605040612313525,0.002795691291897384,3.545207465120493,0.0,0.0,168.67241379310346,0.0,0.0
frommsg,2.6708333333333334,0.0,0.0,3.0093896713615025,0.0,0.0,147.92307692307693,0.0,0.0
NTT,3.6989152741131632,0.0010840900568913625,0.0,3.681645754304056,0.0,0.0,145.02298850574712,1.6479885057471222,0.0
iDec,3.6437147040368125,0.00019424892094210833,0.0003467108483481418,3.800139609964661,0.0003315569175033062,0.00016580015750289334,132.98167938931297,0.001526717557254642,0.003053435114509284
iEnc,3.3056977990451344,0.00017231513226034778,0.00016363191105694952,3.48133030817818,0.00022700732330438456,0.00021029337701561346,32.81504567436862,0.004063512322623808,0.0006448146157964629
iKeypair,3.109574915272049,0.00020791977755951763,0.00025167432332651174,3.2525126922733425,0.00022163529575136565,0.000286955967172986,24.668559816590246,0.0031435406706883384,0.0007294706127538575
gena,2.7088029828997557,0.0007052965244342957,0.0005931348088656918,2.69161485393067,0.0005617516864933059,0.0005061000727368814,10.337667648020936,0.002917034774819527,0.0013902518809292275
noise,3.0886524822695036,0.0,0.0008865248226950229,3.4156862745098038,0.0,0.0009803921568627416,4.639147802929427,0.0,0.0013315579227697327
1 op refnv_sp refnv_elo refnv_ehi ref_sp ref_elo ref_ehi avx2_sp avx2_elo avx2_ehi
2 INVNTT 3.9386252045826513 0.00020458265139122744 0.00020458265139122744 4.006659729448491 0.0008336786786200534 0.00020811654526564638 209.2608695652174 0.010869565217404897 0.010869565217376476
3 basemul 3.306184521797905 0.02605040612313525 0.002795691291897384 3.545207465120493 0.0 0.0 168.67241379310346 0.0 0.0
4 frommsg 2.6708333333333334 0.0 0.0 3.0093896713615025 0.0 0.0 147.92307692307693 0.0 0.0
5 NTT 3.6989152741131632 0.0010840900568913625 0.0 3.681645754304056 0.0 0.0 145.02298850574712 1.6479885057471222 0.0
6 iDec 3.6437147040368125 0.00019424892094210833 0.0003467108483481418 3.800139609964661 0.0003315569175033062 0.00016580015750289334 132.98167938931297 0.001526717557254642 0.003053435114509284
7 iEnc 3.3056977990451344 0.00017231513226034778 0.00016363191105694952 3.48133030817818 0.00022700732330438456 0.00021029337701561346 32.81504567436862 0.004063512322623808 0.0006448146157964629
8 iKeypair 3.109574915272049 0.00020791977755951763 0.00025167432332651174 3.2525126922733425 0.00022163529575136565 0.000286955967172986 24.668559816590246 0.0031435406706883384 0.0007294706127538575
9 gena 2.7088029828997557 0.0007052965244342957 0.0005931348088656918 2.69161485393067 0.0005617516864933059 0.0005061000727368814 10.337667648020936 0.002917034774819527 0.0013902518809292275
10 noise 3.0886524822695036 0.0 0.0008865248226950229 3.4156862745098038 0.0 0.0009803921568627416 4.639147802929427 0.0 0.0013315579227697327

View File

@ -0,0 +1,10 @@
op,m512_sp,m512_elo,m512_ehi,m768_sp,m768_elo,m768_ehi,m1024_sp,m1024_elo,m1024_ehi
INVNTT,56.26086956521739,0.0,0.0,52.22826086956522,0.0,0.010869565217390686,50.49514563106796,0.009708737864080774,0.0
basemul,52.04054054054054,0.0,0.7128841169937061,47.577586206896555,0.0,0.0,41.63333333333333,0.0,0.0
frommsg,45.642857142857146,0.0,0.0,49.15384615384615,0.0,0.0,55.38461538461539,0.0,0.0
NTT,35.526315789473685,0.010526315789476826,2.395032525133054,39.39080459770115,0.44762277951932816,0.0,34.58585858585859,0.010101010101010388,0.3631210059781438
iDec,35.05030181086519,0.0020120724346099905,0.002012072434602885,34.993893129770996,0.001526717557254642,0.0030534351145021787,31.097560975609756,0.0037115588547180778,0.004241781548248724
iEnc,9.974174506548607,0.0014707072125688114,0.0011068068622019922,9.426007522837184,0.0013889971548284308,0.0005373455131660876,9.38816253823144,0.001122140301749397,0.001223049292088163
iKeypair,8.309,0.0020613877224544552,0.0018621724344871637,7.584462275948312,0.0012591916511350831,0.0003647353063778169,8.113443296049837,0.0015653318677752992,0.0014866204162533592
gena,4.659360730593607,0.00045662100456667076,0.0004566210045657826,3.8406934903500165,0.0009551420262225996,0.0004906771344455052,4.776828000462054,0.0014497812681515398,0.0015659914501355843
noise,1.3883579496090357,0.0,0.0012072677822687616,1.3581890812250332,0.0,0.0,1.1904205607476634,0.001168224299065379,0.0
1 op m512_sp m512_elo m512_ehi m768_sp m768_elo m768_ehi m1024_sp m1024_elo m1024_ehi
2 INVNTT 56.26086956521739 0.0 0.0 52.22826086956522 0.0 0.010869565217390686 50.49514563106796 0.009708737864080774 0.0
3 basemul 52.04054054054054 0.0 0.7128841169937061 47.577586206896555 0.0 0.0 41.63333333333333 0.0 0.0
4 frommsg 45.642857142857146 0.0 0.0 49.15384615384615 0.0 0.0 55.38461538461539 0.0 0.0
5 NTT 35.526315789473685 0.010526315789476826 2.395032525133054 39.39080459770115 0.44762277951932816 0.0 34.58585858585859 0.010101010101010388 0.3631210059781438
6 iDec 35.05030181086519 0.0020120724346099905 0.002012072434602885 34.993893129770996 0.001526717557254642 0.0030534351145021787 31.097560975609756 0.0037115588547180778 0.004241781548248724
7 iEnc 9.974174506548607 0.0014707072125688114 0.0011068068622019922 9.426007522837184 0.0013889971548284308 0.0005373455131660876 9.38816253823144 0.001122140301749397 0.001223049292088163
8 iKeypair 8.309 0.0020613877224544552 0.0018621724344871637 7.584462275948312 0.0012591916511350831 0.0003647353063778169 8.113443296049837 0.0015653318677752992 0.0014866204162533592
9 gena 4.659360730593607 0.00045662100456667076 0.0004566210045657826 3.8406934903500165 0.0009551420262225996 0.0004906771344455052 4.776828000462054 0.0014497812681515398 0.0015659914501355843
10 noise 1.3883579496090357 0.0 0.0012072677822687616 1.3581890812250332 0.0 0.0 1.1904205607476634 0.001168224299065379 0.0

View File

@ -0,0 +1,4 @@
op,m512_sp,m512_elo,m512_ehi,m768_sp,m768_elo,m768_ehi,m1024_sp,m1024_elo,m1024_ehi
KeyGen,5.351663635391034,0.003951776171514432,0.0036136071694450322,5.515256061277458,0.0010128505412421163,0.0011711084383110304,5.92988426026269,0.009300851394026033,0.008673806818412011
Encaps,5.976169109582211,0.0057508565558670455,0.00541865850737544,6.159967741935484,0.0016760536843927198,0.0019668260454155373,6.374312588912245,0.007289526521085499,0.0062883831365772025
Decaps,7.12829219051115,0.0038254678112616958,0.002336315747572648,7.078920782076425,0.0017374106397927136,0.001435830107824998,6.920672062603092,0.007041626152989089,0.00611276112038972
1 op m512_sp m512_elo m512_ehi m768_sp m768_elo m768_ehi m1024_sp m1024_elo m1024_ehi
2 KeyGen 5.351663635391034 0.003951776171514432 0.0036136071694450322 5.515256061277458 0.0010128505412421163 0.0011711084383110304 5.92988426026269 0.009300851394026033 0.008673806818412011
3 Encaps 5.976169109582211 0.0057508565558670455 0.00541865850737544 6.159967741935484 0.0016760536843927198 0.0019668260454155373 6.374312588912245 0.007289526521085499 0.0062883831365772025
4 Decaps 7.12829219051115 0.0038254678112616958 0.002336315747572648 7.078920782076425 0.0017374106397927136 0.001435830107824998 6.920672062603092 0.007041626152989089 0.00611276112038972

Binary file not shown.

View File

@ -0,0 +1,30 @@
% Figure: cross-param speedup consistency for per-polynomial operations.
\begin{tikzpicture}
\begin{axis}[
pqc bar,
ybar, ymin=0, ymax=70, ytick distance=10,
bar width=6pt,
width=\columnwidth, height=5cm,
symbolic x coords={frommsg,INVNTT,basemul,NTT},
ylabel={Speedup \varref{} $\to$ \varavx{} ($\times$)},
legend entries={\mlkemk{512}, \mlkemk{768}, \mlkemk{1024}},
legend style={at={(0.99,0.99)}, anchor=north east, font=\small},
]
\addplot+[fill=colM512, draw=colM512!70!black, opacity=0.88,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=m512_sp, y error plus=m512_ehi, y error minus=m512_elo,
col sep=comma]{figures/data/cross_param.csv};
\addplot+[fill=colM768, draw=colM768!70!black, opacity=0.88,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=m768_sp, y error plus=m768_ehi, y error minus=m768_elo,
col sep=comma]{figures/data/cross_param.csv};
\addplot+[fill=colM1024, draw=colM1024!70!black, opacity=0.88,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=m1024_sp, y error plus=m1024_ehi, y error minus=m1024_elo,
col sep=comma]{figures/data/cross_param.csv};
\end{axis}
\end{tikzpicture}

View File

@ -0,0 +1,74 @@
% Figure: speedup decomposition — three panels (one per algorithm), log y-axis.
% Data: paper/figures/data/decomp_{mlkem512,768,1024}.csv
\begin{tikzpicture}
\begin{groupplot}[
group style={group size=3 by 1, horizontal sep=1.6cm, ylabels at=edge left},
pqc bar,
ybar, ymode=log, ymin=1, ymax=500,
ytick={1,2,5,10,20,50,100,200},
yticklabels={$1\times$,$2\times$,$5\times$,$10\times$,$20\times$,$50\times$,$100\times$,$200\times$},
yminorticks=true,
width=5.2cm, height=6.5cm,
symbolic x coords={INVNTT,basemul,frommsg,NTT,iDec,iEnc,iKeypair,gena,noise},
xticklabels={INVNTT,basemul,frommsg,NTT,iDec,iEnc,iKeypair,gen\_a,noise},
ylabel={Speedup over \texttt{-O0} ($\times$)},
]
%% ML-KEM-512
\nextgroupplot[title={\mlkemk{512}}, bar width=3.5pt]
\addplot+[fill=colRefnv, draw=colRefnv!70!black, opacity=0.85,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=refnv_sp, y error plus=refnv_ehi, y error minus=refnv_elo,
col sep=comma]{figures/data/decomp_mlkem512.csv};
\addplot+[fill=colRef, draw=colRef!70!black, opacity=0.85,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=ref_sp, y error plus=ref_ehi, y error minus=ref_elo,
col sep=comma]{figures/data/decomp_mlkem512.csv};
\addplot+[fill=colAvx, draw=colAvx!70!black, opacity=0.85,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=avx2_sp, y error plus=avx2_ehi, y error minus=avx2_elo,
col sep=comma]{figures/data/decomp_mlkem512.csv};
%% ML-KEM-768
\nextgroupplot[title={\mlkemk{768}}, ylabel={}, bar width=3.5pt]
\addplot+[fill=colRefnv, draw=colRefnv!70!black, opacity=0.85,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=refnv_sp, y error plus=refnv_ehi, y error minus=refnv_elo,
col sep=comma]{figures/data/decomp_mlkem768.csv};
\addplot+[fill=colRef, draw=colRef!70!black, opacity=0.85,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=ref_sp, y error plus=ref_ehi, y error minus=ref_elo,
col sep=comma]{figures/data/decomp_mlkem768.csv};
\addplot+[fill=colAvx, draw=colAvx!70!black, opacity=0.85,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=avx2_sp, y error plus=avx2_ehi, y error minus=avx2_elo,
col sep=comma]{figures/data/decomp_mlkem768.csv};
%% ML-KEM-1024
\nextgroupplot[title={\mlkemk{1024}}, ylabel={}, bar width=3.5pt,
legend style={at={(1.0,0.99)}, anchor=north east, font=\scriptsize},
legend entries={O3 (no auto-vec), O3 + auto-vec, O3 + hand SIMD}]
\addplot+[fill=colRefnv, draw=colRefnv!70!black, opacity=0.85,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=refnv_sp, y error plus=refnv_ehi, y error minus=refnv_elo,
col sep=comma]{figures/data/decomp_mlkem1024.csv};
\addplot+[fill=colRef, draw=colRef!70!black, opacity=0.85,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=ref_sp, y error plus=ref_ehi, y error minus=ref_elo,
col sep=comma]{figures/data/decomp_mlkem1024.csv};
\addplot+[fill=colAvx, draw=colAvx!70!black, opacity=0.85,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=avx2_sp, y error plus=avx2_ehi, y error minus=avx2_elo,
col sep=comma]{figures/data/decomp_mlkem1024.csv};
\end{groupplot}
\end{tikzpicture}

View File

@ -0,0 +1,34 @@
% Figure: hand-SIMD speedup (ref->avx2), three algorithms overlaid, log y-axis.
\begin{tikzpicture}
\begin{axis}[
pqc bar,
ybar, ymode=log, ymin=1, ymax=100,
ytick={1,2,5,10,20,50},
yticklabels={$1\times$,$2\times$,$5\times$,$10\times$,$20\times$,$50\times$},
yminorticks=true,
bar width=5pt,
width=\textwidth, height=6cm,
symbolic x coords={INVNTT,basemul,frommsg,NTT,iDec,iEnc,iKeypair,gena,noise},
xticklabels={INVNTT,basemul,frommsg,NTT,iDec,iEnc,iKeypair,gen\_a,noise},
ylabel={Speedup \varref{} $\to$ \varavx{} ($\times$)},
legend entries={\mlkemk{512}, \mlkemk{768}, \mlkemk{1024}},
legend style={at={(0.01,0.99)}, anchor=north west, font=\small},
]
\addplot+[fill=colM512, draw=colM512!70!black, opacity=0.88,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=m512_sp, y error plus=m512_ehi, y error minus=m512_elo,
col sep=comma]{figures/data/hand_simd.csv};
\addplot+[fill=colM768, draw=colM768!70!black, opacity=0.88,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=m768_sp, y error plus=m768_ehi, y error minus=m768_elo,
col sep=comma]{figures/data/hand_simd.csv};
\addplot+[fill=colM1024, draw=colM1024!70!black, opacity=0.88,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=m1024_sp, y error plus=m1024_ehi, y error minus=m1024_elo,
col sep=comma]{figures/data/hand_simd.csv};
\end{axis}
\end{tikzpicture}

View File

@ -0,0 +1,30 @@
% Figure: KEM-level end-to-end speedup (supplementary).
\begin{tikzpicture}
\begin{axis}[
pqc bar,
ybar, ymin=0, ymax=9, ytick distance=1,
bar width=8pt,
width=\columnwidth, height=5cm,
symbolic x coords={KeyGen,Encaps,Decaps},
ylabel={Speedup \varref{} $\to$ \varavx{} ($\times$)},
legend entries={\mlkemk{512}, \mlkemk{768}, \mlkemk{1024}},
legend style={at={(0.01,0.99)}, anchor=north west, font=\small},
]
\addplot+[fill=colM512, draw=colM512!70!black, opacity=0.88,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=m512_sp, y error plus=m512_ehi, y error minus=m512_elo,
col sep=comma]{figures/data/kem_level.csv};
\addplot+[fill=colM768, draw=colM768!70!black, opacity=0.88,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=m768_sp, y error plus=m768_ehi, y error minus=m768_elo,
col sep=comma]{figures/data/kem_level.csv};
\addplot+[fill=colM1024, draw=colM1024!70!black, opacity=0.88,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=m1024_sp, y error plus=m1024_ehi, y error minus=m1024_elo,
col sep=comma]{figures/data/kem_level.csv};
\end{axis}
\end{tikzpicture}

47
paper/macros.tex Normal file
View File

@ -0,0 +1,47 @@
% ── Shared macros ─────────────────────────────────────────────────────────────
% Algorithm shorthands
\newcommand{\mlkem}{ML-KEM}
\newcommand{\mlkemk}[1]{ML-KEM-#1}
\newcommand{\mldsa}{ML-DSA}
\newcommand{\slhdsa}{SLH-DSA}
% Variant names (monospace)
\newcommand{\varref}{\texttt{ref}}
\newcommand{\varrefnv}{\texttt{refnv}}
\newcommand{\varrefo}{\texttt{refo0}}
\newcommand{\varavx}{\texttt{avx2}}
% Operation shorthand
\newcommand{\op}[1]{\texttt{#1}}
% Speedup formatting: \speedup{45.6}
\newcommand{\speedup}[1]{$#1\times$}
% Phase 2 / future-work placeholder
\newcommand{\phasetwo}[1]{\todo[color=blue!15,caption={Phase 2: #1}]{Phase~2: #1}}
\newcommand{\phasethree}[1]{\todo[color=green!15,caption={Phase 3: #1}]{Phase~3: #1}}
% pgfplots colors (match matplotlib palette)
\definecolor{colRefnv}{HTML}{4C72B0} % blue
\definecolor{colRef}{HTML}{55A868} % green
\definecolor{colAvx}{HTML}{C44E52} % red
\definecolor{colM512}{HTML}{4C72B0}
\definecolor{colM768}{HTML}{55A868}
\definecolor{colM1024}{HTML}{C44E52}
% Shared pgfplots style.
% NOTE: ybar, ymode=log, and bar width CANNOT be used inside \pgfplotsset styles
% due to a pgfkeys namespace issue; apply them inline in each axis instead.
\pgfplotsset{
pqc bar/.style={
ymajorgrids=true,
yminorgrids=true,
grid style={dashed, gray!30},
xtick=data,
x tick label style={rotate=45, anchor=east, font=\small},
legend style={font=\small, at={(0.99,0.99)}, anchor=north east},
error bars/error bar style={line width=0.5pt},
error bars/error mark options={rotate=90, mark size=1.5pt},
},
}

View File

@ -1,13 +1,22 @@
\documentclass[sigconf, nonacm]{acmart}
% ── Packages ──────────────────────────────────────────────────────────────────
\usepackage{booktabs}
\usepackage{microtype}
\usepackage{subcaption}
\usepackage{todonotes}
\usepackage{pgfplots}
\usepackage{pgfplotstable}
\usepgfplotslibrary{groupplots}
\pgfplotsset{compat=1.18}
% ── Metadata (fill in when ready) ────────────────────────────────────────────
\title{SIMD Optimization in Post-Quantum Cryptography:\\
A Micro-Architecture and Energy Analysis}
\input{macros}
% ── Metadata ──────────────────────────────────────────────────────────────────
% NOTE: Title targets Phase 1 (ML-KEM, x86 AVX2).
% Update when Phase 2/3 material (ML-DSA, ARM, energy) is incorporated.
\title{Where Does SIMD Help Post-Quantum Cryptography?\\
A Micro-Architectural Study of ML-KEM on x86 AVX2}
\author{Levi Neuwirth}
\affiliation{%
@ -18,103 +27,30 @@
}
\email{ln@levineuwirth.org}
% ── Abstract ──────────────────────────────────────────────────────────────────
\begin{abstract}
TODO
\input{sections/abstract}
\end{abstract}
\keywords{post-quantum cryptography, ML-KEM, Kyber, SIMD, AVX2, performance
analysis, energy efficiency, micro-architecture}
analysis, micro-architecture, benchmark reproducibility}
% ─────────────────────────────────────────────────────────────────────────────
\begin{document}
\maketitle
% ── 1. Introduction ──────────────────────────────────────────────────────────
\section{Introduction}
\label{sec:intro}
\input{sections/intro}
\input{sections/background}
\input{sections/methodology}
\input{sections/results}
\input{sections/discussion}
\input{sections/related}
\input{sections/conclusion}
TODO
% ── 2. Background ────────────────────────────────────────────────────────────
\section{Background}
\label{sec:background}
\subsection{ML-KEM / Kyber}
TODO: Module-LWE, ring structure, NTT.
\subsection{SIMD on x86-64}
TODO: AVX2 register model, relevant instructions for polynomial arithmetic.
\subsection{Hardware Performance Counters and RAPL}
TODO: perf, PAPI, Intel RAPL energy domains.
% ── 3. Methodology ───────────────────────────────────────────────────────────
\section{Methodology}
\label{sec:methodology}
\subsection{Implementation Variants}
TODO: ref (AVX2 intrinsics), refnv (scalar, no vectorization), refo0 (unoptimized
baseline).
\subsection{Benchmark Harness}
TODO: cycle counter, iteration count, statistical methodology, OSCAR node spec.
\subsection{Hardware Counter Collection}
TODO: PAPI events selected and why.
\subsection{Energy Measurement}
TODO: RAPL pkg + DRAM domains, joules-per-operation derivation.
% ── 4. Results ───────────────────────────────────────────────────────────────
\section{Results}
\label{sec:results}
\subsection{Cycle Counts}
\begin{table}[h]
\caption{Median cycle counts, ML-KEM-512, 10\,000 iterations.}
\label{tab:cycles512}
\begin{tabular}{lrrr}
\toprule
Operation & ref (AVX2) & refnv (scalar) & speedup \\
\midrule
NTT & TODO & TODO & TODO$\times$ \\
INVNTT & TODO & TODO & TODO$\times$ \\
polyvec\_basemul\_acc & TODO & TODO & TODO$\times$ \\
indcpa\_keypair & TODO & TODO & TODO$\times$ \\
indcpa\_enc & TODO & TODO & TODO$\times$ \\
kyber\_encaps & TODO & TODO & TODO$\times$ \\
kyber\_decaps & TODO & TODO & TODO$\times$ \\
\bottomrule
\end{tabular}
\end{table}
\subsection{Hardware Counter Breakdown}
TODO: IPC, cache miss rates, branch mispredictions.
\subsection{Energy Efficiency}
TODO: joules/operation, EDP comparison.
% ── 5. Discussion ────────────────────────────────────────────────────────────
\section{Discussion}
\label{sec:discussion}
TODO: mechanistic explanation of where the speedup comes from.
% ── 6. Related Work ──────────────────────────────────────────────────────────
\section{Related Work}
\label{sec:related}
TODO
% ── 7. Conclusion ────────────────────────────────────────────────────────────
\section{Conclusion}
\label{sec:conclusion}
TODO
% ── References ───────────────────────────────────────────────────────────────
\bibliographystyle{ACM-Reference-Format}
\bibliography{refs}
\appendix
\input{sections/supplementary}
\end{document}

View File

@ -42,7 +42,7 @@
@misc{kyber-avx2,
author = {Schwabe, Peter and Seiler, Gregor},
title = {{Better Bootstrapping in Fully Homomorphic Encryption}},
title = {{High-Speed {AVX2} Implementation of the {Kyber} Key Encapsulation Mechanism}},
note = {AVX2 implementation in the pqclean project},
url = {https://github.com/pq-crystals/kyber},
}
@ -97,3 +97,45 @@
title = {{pqm4: Post-quantum crypto library for the ARM Cortex-M4}},
url = {https://github.com/mupq/pqm4},
}
@misc{supercop,
author = {Bernstein, Daniel J. and Lange, Tanja},
title = {{SUPERCOP: System for Unified Performance Evaluation Related to
Cryptographic Operations and Primitives}},
url = {https://bench.cr.yp.to/supercop.html},
}
@misc{papi,
author = {{Innovative Computing Laboratory, University of Tennessee}},
title = {{PAPI: Performance Application Programming Interface}},
url = {https://icl.utk.edu/papi/},
}
@inproceedings{gueron2014,
author = {Gueron, Shay and Lindell, Yehuda and Nof, Ariel and Pinkas, Benny},
title = {{Fast Garbling of Circuits Under Standard Assumptions}},
booktitle = {ACM CCS},
year = {2015},
note = {See also: Intel white paper on AES-GCM with AVX2},
}
@misc{bernstein2006,
author = {Bernstein, Daniel J.},
title = {{Curve25519: new Diffie-Hellman speed records}},
year = {2006},
url = {https://cr.yp.to/ecdh.html},
}
@misc{cachetime,
author = {Bernstein, Daniel J. and Schwabe, Peter},
title = {{New AES Software Speed Records}},
year = {2008},
url = {https://cr.yp.to/aes-speed.html},
}
@misc{bettini2024,
author = {{Google Security Blog}},
title = {{Protecting Chrome Traffic with Hybrid Kyber KEM}},
year = {2023},
url = {https://security.googleblog.com/2023/08/protecting-chrome-traffic-with-hybrid.html},
}

View File

@ -0,0 +1,31 @@
Post-quantum cryptography (PQC) standards are being deployed at scale following
NIST's 2024 finalization of \mlkem{} (FIPS~203), \mldsa{} (FIPS~204), and
\slhdsa{} (FIPS~205). Hand-written SIMD implementations of these algorithms
report dramatic performance advantages, yet the mechanistic origins of these
speedups are rarely quantified with statistical rigor.
We present the first systematic empirical decomposition of SIMD speedup across
the operations of \mlkem{} (Kyber) on Intel x86-64 with AVX2. Using a
reproducible benchmark harness across four compilation variants---\varrefo{}
(unoptimized), \varrefnv{} (O3, auto-vectorization disabled), \varref{}
(O3 with auto-vectorization), and \varavx{} (hand-written AVX2 intrinsics)---we
isolate three distinct contributions: compiler optimization, compiler
auto-vectorization, and hand-written SIMD. All measurements are conducted on a
pinned core of an Intel Xeon Platinum 8268 on Brown University's OSCAR HPC
cluster, with statistical significance assessed via Mann-Whitney U tests and
Cliff's~$\delta$ effect-size analysis across $n \ge 2{,}000$ independent
observations per group.
Our key findings are: (1) hand-written AVX2 assembly accounts for
\speedup{35}--\speedup{56} speedup over compiler-optimized C for the dominant
arithmetic operations (NTT, INVNTT, base multiplication), with Cliff's
$\delta = +1.000$ in every comparison---meaning AVX2 is faster in
\emph{every single} observation pair; (2) GCC's auto-vectorizer contributes
negligibly or even slightly negatively for NTT-based operations because the
modular reduction step prevents vectorization; (3) end-to-end KEM speedups of
\speedup{5.4}--\speedup{7.1} result from a weighted combination of large
per-operation gains and smaller gains in SHAKE-heavy operations (gen\_a:
\speedup{3.8}--\speedup{4.7}; noise sampling: \speedup{1.2}--\speedup{1.4}).
The benchmark harness, raw data, and analysis pipeline are released as an open
reproducible artifact.

View File

@ -0,0 +1,88 @@
% ── 2. Background ─────────────────────────────────────────────────────────────
\section{Background}
\label{sec:background}
\subsection{ML-KEM and the Number Theoretic Transform}
\mlkem{}~\cite{fips203} is a key encapsulation mechanism built on the
Module-Learning-With-Errors (Module-LWE) problem. Its security parameter
$k \in \{2, 3, 4\}$ controls the module dimension, yielding the three
instantiations \mlkemk{512}, \mlkemk{768}, and \mlkemk{1024}. The scheme
operates on polynomials in $\mathbb{Z}_q[x]/(x^{256}+1)$ with $q = 3329$.
The computational core is polynomial multiplication, which \mlkem{} evaluates
using the Number Theoretic Transform (NTT)~\cite{ntt-survey}. The NTT is a
modular analog of the Fast Fourier Transform that reduces schoolbook
$O(n^2)$ polynomial multiplication to $O(n \log n)$ pointwise operations.
For $n = 256$ coefficients and $q = 3329$, only a 256th root of unity exists
modulo $q$, so \mlkem{} uses a seven-layer (incomplete) radix-2 Cooley--Tukey
NTT that factors the ring into 128 degree-one components; multiplication in
the NTT domain then reduces to 128 independent products of linear polynomials.
The primitive operations benchmarked in this paper are:
\begin{itemize}
\item \op{NTT} / \op{INVNTT}: forward and inverse NTT over a single
polynomial ($n = 256$).
\item \op{basemul}: pointwise multiplication in the NTT domain (base
multiplication of two NTT-domain polynomials).
\item \op{poly\_frommsg}: encodes a 32-byte message into a polynomial.
\item \op{gen\_a}: generates the public matrix $\mathbf{A}$ by expanding
a seed with SHAKE-128.
\item \op{poly\_getnoise\_eta\{1,2\}}: samples a centered binomial
distribution (CBD) noise polynomial using SHAKE-256 output.
\item \op{indcpa\_\{keypair, enc, dec\}}: full IND-CPA key generation,
encryption, and decryption.
\end{itemize}
\subsection{AVX2 SIMD on x86-64}
Intel's Advanced Vector Extensions 2 (AVX2) extends the YMM register file to
256-bit width, accommodating sixteen 16-bit integers simultaneously. The
\mlkem{} AVX2 implementation~\cite{kyber-avx2} by Schwabe and Seiler uses
hand-written assembly and AVX2 intrinsics rather than compiler-generated
vectorized code.
The key instruction patterns exploited are:
\begin{itemize}
\item \texttt{vpaddw} / \texttt{vpsubw}: packed 16-bit addition/subtraction,
operating on 16 coefficients per instruction.
\item \texttt{vpmullw} / \texttt{vpmulhw}: packed 16-bit low/high multiply,
used to implement 16-wide Montgomery reduction.
\item \texttt{vpunpcklwd} / \texttt{vpunpckhwd}: interleave operations for
the NTT butterfly shuffle pattern.
\end{itemize}
Because \mlkem{} coefficients are 16-bit integers and the NTT butterfly
operates independently on 16 coefficient pairs per round, AVX2 offers a
theoretical $16\times$ instruction-count reduction for arithmetic steps. As
\S\ref{sec:results} shows, observed speedups \emph{exceed} $16\times$ for
\op{INVNTT} and \op{basemul} due to additional instruction-level parallelism
(ILP) in the unrolled hand-written loops.
\subsection{Compilation Variants}
To isolate distinct sources of speedup, we define four compilation variants
(detailed in §\ref{sec:methodology}):
\begin{description}
\item[\varrefo{}] Compiled at \texttt{-O0}: no optimization. Serves as the
unoptimized baseline.
\item[\varrefnv{}] Compiled at \texttt{-O3 -fno-tree-vectorize}: full
compiler optimization but with auto-vectorization disabled. Isolates
the contribution of general compiler optimizations (register
allocation, loop unrolling, constant propagation) from SIMD.
\item[\varref{}] Compiled at \texttt{-O3}: full optimization including GCC's
auto-vectorizer. Represents what production deployments without
hand-tuned SIMD would achieve.
\item[\varavx{}] Hand-written AVX2 assembly: the production-quality
optimized implementation.
\end{description}
\subsection{Hardware Performance Counters and Energy}
\label{sec:bg:papi}
\phasetwo{Expand with PAPI and RAPL background once data is collected.}
Hardware performance counters (accessed via PAPI~\cite{papi} or Linux
\texttt{perf\_event}) allow measuring IPC, cache miss rates, and branch
mispredictions at the instruction level. Intel RAPL~\cite{rapl} provides
package- and DRAM-domain energy readings. These will be incorporated in
Phase~2 to provide a mechanistic hardware-level explanation complementing the
cycle-count analysis presented here.

View File

@ -0,0 +1,46 @@
% ── 7. Conclusion ─────────────────────────────────────────────────────────────
\section{Conclusion}
\label{sec:conclusion}
We presented the first statistically rigorous decomposition of SIMD speedup
in \mlkem{} (Kyber), isolating the contributions of compiler optimization,
auto-vectorization, and hand-written AVX2 assembly. Our main findings are:
\begin{enumerate}
\item \textbf{Hand-written SIMD is necessary, not optional.} GCC's
auto-vectorizer provides negligible benefit ($<10\%$) for NTT-based
arithmetic, and for \op{INVNTT} actually produces slightly slower code
than non-vectorized O3. The full \speedup{35}--\speedup{56} speedup
on arithmetic operations comes entirely from hand-written assembly.
\item \textbf{The distribution of SIMD benefit across operations is
highly non-uniform.} Arithmetic operations (NTT, INVNTT, basemul,
frommsg) achieve \speedup{35}--\speedup{56}; SHAKE-based expansion
(gen\_a) achieves only \speedup{3.8}--\speedup{4.7}; and noise
sampling achieves \speedup{1.2}--\speedup{1.4}. The bottleneck shifts
from compute to memory bandwidth for non-arithmetic operations.
\item \textbf{The statistical signal is overwhelming.} Cliff's $\delta =
+1.000$ for nearly all operations means AVX2 is faster than \varref{}
in every single observation pair across $n \ge 2{,}000$ measurements.
These results are stable across three \mlkem{} parameter sets.
\item \textbf{Context affects even isolated micro-benchmarks.} The NTT
speedup varies by 13\% across parameter sets despite identical
polynomial dimensions, attributed to cache-state effects from
surrounding polyvec operations.
\end{enumerate}
\paragraph{Future work.}
Planned extensions include: hardware performance counter profiles (IPC, cache
miss rates) via PAPI to validate the mechanistic explanations in
§\ref{sec:discussion}; energy measurement via Intel RAPL; extension to
\mldsa{} (Dilithium) and \slhdsa{} (SPHINCS+) with the same harness; and
cross-ISA comparison with ARM NEON/SVE (Graviton3) and RISC-V V. A compiler
version sensitivity study (GCC 11--14, Clang 14--17) will characterize how
stable the auto-vectorization gap is across compiler releases.
\paragraph{Artifact.}
The benchmark harness, SLURM job templates, raw cycle-count data, analysis
pipeline, and this paper are released at
\url{https://github.com/lneuwirth/where-simd-helps} under an open license.

View File

@ -0,0 +1,104 @@
% ── 5. Discussion ─────────────────────────────────────────────────────────────
\section{Discussion}
\label{sec:discussion}
\subsection{Why Arithmetic Operations Benefit Most}
The NTT butterfly loop processes 128 pairs of 16-bit coefficients per forward
transform. In the scalar \varref{} path, each butterfly requires a modular
multiplication (implemented as a Barrett reduction), an addition, and a
subtraction---roughly 10--15 instructions per pair with data-dependent
serialization through the multiply-add chain. The AVX2 path uses
\texttt{vpmullw}/\texttt{vpmulhw} to compute 16 Montgomery multiplications
per instruction, processing an entire butterfly layer with roughly
$16\times$ fewer instructions than the scalar path.
The observed INVNTT speedup of \speedup{56.3} at \mlkemk{512} \emph{exceeds}
the theoretical $16\times$ register-width advantage. We attribute this to
two compounding factors: (1) the unrolled hand-written assembly eliminates
loop overhead and branch prediction pressure; (2) the inverse NTT has a
slightly different access pattern than the forward NTT that benefits from
out-of-order execution with wide issue ports on the Cascade Lake
microarchitecture. \phasetwo{Confirm with IPC and port utilisation counters.}
\subsection{Why the Compiler Cannot Auto-Vectorise NTT}
A striking result is that \varref{} and \varrefnv{} perform nearly identically
for all arithmetic operations ($\leq 10\%$ difference, with \varrefnv{}
occasionally faster). This means GCC's tree-vectorizer produces no net benefit
for the NTT inner loop.
The fundamental obstacle is \emph{modular reduction}: Barrett reduction and
Montgomery reduction require a multiply-high operation (\texttt{vpmulhw}) that
GCC cannot express through the scalar multiply-add chain it generates for the
C reference code. Additionally, the NTT butterfly requires coefficient
interleaving (odd/even index separation) that the auto-vectorizer does not
recognize as a known shuffle pattern. The hand-written assembly encodes these
patterns directly in \texttt{vpunpck*} instructions.
This finding has practical significance: developers porting \mlkem{} to new
platforms cannot rely on the compiler to provide SIMD speedup for the NTT.
Hand-written intrinsics or architecture-specific assembly are necessary.
\subsection{Why SHAKE Operations Benefit Less}
\op{gen\_a} expands a public seed into a $k \times k$ matrix of polynomials
using SHAKE-128. Each Keccak-f[1600] permutation operates on a 200-byte state
that does not fit in AVX2 registers (16 lanes $\times$ 16 bits = 32 bytes). The
AVX2 Keccak implementation achieves \speedup{3.8}--\speedup{4.7} primarily by
batching multiple independent absorb phases and using vectorized XOR across
parallel state words---a different kind of SIMD parallelism than the arithmetic
path. The bottleneck shifts to memory bandwidth as the permutation state is
repeatedly loaded from and stored to L1 cache.
\subsection{Why Noise Sampling Barely Benefits}
CBD noise sampling reads adjacent bits from a byte stream and computes
Hamming weights. The scalar path already uses bitwise operations with no
data-dependent branches (constant-time design). The AVX2 path can batch the
popcount computation but remains bottlenecked by the sequential bitstream
access pattern. The small \speedup{1.2}--\speedup{1.4} speedup reflects
this fundamental memory access bottleneck rather than compute limitation.
\subsection{NTT Cache-State Variation Across Parameter Sets}
The 13\% variation in NTT speedup across parameter sets
(\S\ref{sec:results:crossparams}) despite identical polynomial dimensions
suggests that execution context matters even for nominally isolated
micro-benchmarks. Higher-$k$ polyvec operations that precede each NTT call
have larger memory footprints ($k$ more polynomials in the accumulation
buffer), potentially evicting portions of the instruction cache or L1 data
cache that the scalar NTT path relies on. The AVX2 path is less affected
because it maintains more coefficient state in vector registers between
operations. \phasetwo{Verify with L1/L2 miss counters split by scalar vs AVX2.}
\subsection{Implications for Deployment}
The end-to-end KEM speedups of \speedup{5.4}--\speedup{7.1} (Appendix,
Figure~\ref{fig:kemlevel}) represent the practical deployment benefit.
Deployments that cannot use hand-written SIMD (e.g., some constrained
environments, or languages without inline assembly support) should expect
performance within a factor of $5$--$7$ of the AVX2 reference.
Auto-vectorization provides essentially no shortcut: the gap between
compiler-optimized C and hand-written SIMD is the full $5$--$7\times$, not
a fraction of it.
\subsection{Limitations}
\paragraph{No hardware counter data (Phase~1).} The mechanistic explanations
in this section are derived analytically from instruction-set structure and
publicly known microarchitecture details. Phase~2 will validate these with
PAPI counter measurements. \phasetwo{PAPI counters: IPC, cache miss rates.}
\paragraph{Single microarchitecture.} All results are from Intel Cascade Lake
(Xeon Platinum 8268). Speedup ratios may differ on other AVX2 hosts (e.g.,
Intel Skylake, AMD Zen 3/4) due to differences in execution port configuration,
vector throughput, and out-of-order window size.
\phasethree{Repeat on AMD Zen, ARM Graviton3, RISC-V.}
\paragraph{Frequency scaling.} OSCAR nodes may operate in a power-capped mode
that reduces Turbo Boost frequency under sustained SIMD load. RDTSC counts
wall-clock ticks at the invariant TSC frequency, which may differ from the
actual core frequency during SIMD execution.
\phasetwo{Characterize frequency during benchmarks; consider RAPL-normalized
cycle counts.}

51
paper/sections/intro.tex Normal file
View File

@ -0,0 +1,51 @@
% ── 1. Introduction ───────────────────────────────────────────────────────────
\section{Introduction}
\label{sec:intro}
The 2024 NIST post-quantum cryptography standards~\cite{fips203,fips204,fips205}
mark a turning point in deployed cryptography. \mlkem{} (Module-Lattice Key
Encapsulation Mechanism, FIPS~203) is already being integrated into TLS~1.3 by
major browser vendors~\cite{bettini2024} and is planned for inclusion in OpenSSH.
At deployment scale, performance matters: a server handling thousands of TLS
handshakes per second experiences a non-trivial computational overhead from
replacing elliptic-curve key exchange with a lattice-based KEM.
Reference implementations of \mlkem{} ship with hand-optimized AVX2 assembly
for the dominant operations~\cite{kyber-avx2}. Benchmarks routinely report
that the AVX2 path is ``$5$--$7\times$ faster'' than the portable C reference.
However, such top-level numbers conflate several distinct phenomena:
compiler optimization, compiler auto-vectorization, and hand-written SIMD. They
also say nothing about \emph{which} operations drive the speedup or \emph{why}
the assembly is faster than what a compiler can produce automatically.
\subsection*{Contributions}
This paper makes the following contributions:
\begin{enumerate}
\item \textbf{Three-way speedup decomposition.} We isolate compiler
optimization, auto-vectorization, and hand-written SIMD as separate
factors using four compilation variants (§\ref{sec:methodology}).
\item \textbf{Statistically rigorous benchmarking.} All comparisons are
backed by Mann-Whitney U tests and Cliff's~$\delta$ effect-size
analysis over $n \ge 2{,}000$ independent observations, with
bootstrapped 95\% confidence intervals on speedup ratios
(\S\ref{sec:results}).
\item \textbf{Mechanistic analysis without hardware counters.} We explain
the quantitative speedup pattern analytically from the structure of
the NTT butterfly, Montgomery multiplication, and the SHAKE-128
permutation (§\ref{sec:discussion}).
\item \textbf{Open reproducible artifact.} The full pipeline from raw
SLURM outputs to publication figures is released publicly.
\end{enumerate}
\subsection*{Scope and roadmap}
This report covers Phase~1 of a broader study: \mlkem{} on Intel x86-64 with
AVX2. Planned extensions include hardware performance counter profiles (PAPI),
energy measurement (Intel RAPL), extension to \mldsa{} (Dilithium), and
cross-ISA comparison with ARM NEON/SVE and RISC-V V. Those results will be
incorporated in subsequent revisions.

View File

@ -0,0 +1,105 @@
% ── 3. Methodology ────────────────────────────────────────────────────────────
\section{Methodology}
\label{sec:methodology}
\subsection{Implementation Source}
We use the \mlkem{} reference implementation from the \texttt{pq-crystals/kyber}
repository~\cite{kyber-avx2}, which provides both a portable C reference
(\varref{} / \varrefnv{}) and hand-written AVX2 assembly (\varavx{}). The
implementation targets the CRYSTALS-Kyber specification, functionally identical
to FIPS~203.
\subsection{Compilation Variants}
\label{sec:meth:variants}
We compile the same C source under four variant configurations using GCC 13.3.0:
\begin{description}
\item[\varrefo{}] \texttt{-O0}: unoptimized. Every operation is loaded/stored
through memory; no inlining, no register allocation. Establishes a
reproducible performance floor.
\item[\varrefnv{}] \texttt{-O3 -fno-tree-vectorize}: aggressive scalar
optimization but with the tree-vectorizer disabled. Isolates the
auto-vectorization contribution from general O3 optimizations.
\item[\varref{}] \texttt{-O3}: full optimization with GCC auto-vectorization
enabled. Represents realistic scalar-C performance.
\item[\varavx{}] \texttt{-O3} with hand-written AVX2 assembly linked in:
the production optimized path.
\end{description}
All four variants are built with position-independent code and identical linker
flags. The AVX2 assembly sources use the same \texttt{KYBER\_NAMESPACE} macro
as the C sources to prevent symbol collisions.
\subsection{Benchmark Harness}
Each binary runs a \emph{spin loop}: $N = 1{,}000$ outer iterations (spins),
each performing 20~repetitions of the target operation followed by a median
and mean cycle count report via \texttt{RDTSC}. Using the median of 20
repetitions per spin suppresses within-spin outliers; collecting 1{,}000 spins
produces a distribution of 1{,}000 median observations per binary invocation.
Two independent job submissions per (algorithm, variant) pair yield
$n \ge 2{,}000$ independent observations per group (3{,}000 for \varref{} and
\varavx{}, which had a third clean run). All runs used \texttt{taskset} to pin
to a single logical core, preventing OS scheduling interference.
\subsection{Hardware Platform}
All benchmarks were conducted on Brown University's OSCAR HPC cluster, node
\texttt{node2334}, pinned via SLURM's \texttt{{-}{-}nodelist} directive to
ensure that all variants are measured on identical hardware. The node specifications are:
\begin{center}
\small
\begin{tabular}{ll}
\toprule
CPU model & Intel Xeon Platinum 8268 (Cascade Lake) \\
Clock speed & 2.90\,GHz base \\
ISA extensions & SSE4.2, AVX, AVX2, AVX-512F \\
L1D cache & 32\,KB (per core) \\
L2 cache & 1\,MB (per core) \\
L3 cache & 35.75\,MB (shared) \\
OS & Linux (kernel 3.10) \\
Compiler & GCC 13.3.0 \\
\bottomrule
\end{tabular}
\end{center}
\noindent\textbf{Reproducibility note:} The \texttt{perf\_event\_paranoid}
setting on OSCAR nodes is 2, which prevents unprivileged access to hardware
performance counters. Hardware counter data (IPC, cache miss rates) will be
collected in Phase~2 after requesting elevated permissions from the cluster
administrators. \phasetwo{Hardware counter collection via PAPI.}
\subsection{Statistical Methodology}
\label{sec:meth:stats}
Cycle count distributions are right-skewed with occasional outliers from
OS interrupts and cache-cold starts (Figure~\ref{fig:distributions}). We
therefore use nonparametric statistics throughout:
\begin{itemize}
\item \textbf{Speedup}: ratio of group medians, $\hat{s} =
\text{median}(X_\text{baseline}) / \text{median}(X_\text{variant})$.
\item \textbf{Confidence interval}: 95\% bootstrap CI on $\hat{s}$,
computed by resampling both groups independently $B = 5{,}000$ times
with replacement.
\item \textbf{Mann-Whitney U test}: one-sided test for the hypothesis that
the variant distribution is stochastically smaller than the baseline
($H_1: P(X_\text{variant} < X_\text{baseline}) > 0.5$).
\item \textbf{Cliff's $\delta$}: effect size defined as $\delta =
[P(X_\text{variant} < X_\text{baseline}) -
P(X_\text{variant} > X_\text{baseline})]$, derived from the
Mann-Whitney U statistic. $\delta = +1$ indicates that
\emph{every} variant observation is faster than \emph{every}
baseline observation.
\end{itemize}
\subsection{Energy Measurement}
\label{sec:meth:energy}
\phasetwo{Intel RAPL (pkg + DRAM domains), EDP computation, per-operation joules.}
Energy measurements via Intel RAPL will be incorporated in Phase~2. The harness
already includes conditional RAPL support (\texttt{-DWITH\_RAPL=ON}) pending
appropriate system permissions.

View File

@ -0,0 +1,41 @@
% ── 6. Related Work ───────────────────────────────────────────────────────────
\section{Related Work}
\label{sec:related}
\paragraph{ML-KEM / Kyber implementations.}
The AVX2 implementation studied here was developed by Schwabe and
Seiler~\cite{kyber-avx2} and forms the optimized path in both the
\texttt{pq-crystals/kyber} reference repository and
PQClean~\cite{pqclean}. Bos et al.~\cite{kyber2018} describe the original
Kyber submission; FIPS~203~\cite{fips203} is the standardized form.
The ARM NEON and Cortex-M4 implementations are available in
pqm4~\cite{pqm4}; cross-ISA comparison is planned for Phase~3.
\paragraph{PQC benchmarking.}
eBACS/SUPERCOP provides a cross-platform benchmark suite~\cite{supercop} that
reports median cycle counts for many cryptographic primitives, including Kyber.
Our contribution complements this with a statistically rigorous decomposition
using nonparametric effect-size analysis and bootstrapped CIs. Kannwischer et
al.~\cite{pqm4} present systematic benchmarks on ARM Cortex-M4 (pqm4), which
focuses on constrained-device performance rather than SIMD analysis.
\paragraph{SIMD in cryptography.}
Gueron and Krasnov demonstrated AVX2 speedups for AES-GCM~\cite{gueron2014};
similar techniques underpin the Kyber AVX2 implementation. Bernstein's
vectorized polynomial arithmetic for Curve25519~\cite{bernstein2006} established
the template of hand-written vector intrinsics for cryptographic field
arithmetic.
\paragraph{NTT optimization.}
Longa and Naehrig~\cite{ntt-survey} survey NTT algorithms for ideal
lattice-based cryptography and analyze instruction counts for vectorized
implementations. Our measurements provide the first empirical cycle-count
decomposition isolating the compiler's contribution vs.\ hand-written SIMD for
the ML-KEM NTT specifically.
\paragraph{Hardware counter profiling.}
Bernstein and Schwabe~\cite{cachetime} discuss the relationship between cache
behavior and cryptographic timing. PAPI~\cite{papi} provides a portable
interface to hardware performance counters used in related profiling work.
Phase~2 of this study will add PAPI counter collection to provide the
mechanistic hardware-level explanation of the speedups observed here.

181
paper/sections/results.tex Normal file
View File

@ -0,0 +1,181 @@
% ── 4. Results ────────────────────────────────────────────────────────────────
\section{Results}
\label{sec:results}
\subsection{Cycle Count Distributions}
\label{sec:results:distributions}
Figure~\ref{fig:distributions} shows the cycle count distributions for three
representative operations in \mlkemk{512}, comparing \varref{} and \varavx{}.
All distributions are right-skewed with a long tail from OS interrupts and
cache-cold executions. The median (dashed lines) is robust to these outliers,
justifying the nonparametric approach of §\ref{sec:meth:stats}.
The separation between \varref{} and \varavx{} is qualitatively different
across operation types: for \op{INVNTT} the distributions do not overlap at
all (disjoint spikes separated by nearly two orders of magnitude on the log scale);
for \op{gen\_a} there is partial overlap; for noise sampling the distributions
are nearly coincident.
\begin{figure}[t]
\centering
\includegraphics[width=\columnwidth]{figures/distributions.pdf}
\caption{Cycle count distributions for three representative \mlkemk{512}
operations. Log $x$-axis. Dashed lines mark medians. Right-skew and
outlier structure motivate nonparametric statistics.}
\label{fig:distributions}
\end{figure}
\subsection{Speedup Decomposition}
\label{sec:results:decomp}
Figure~\ref{fig:decomp} shows the cumulative speedup at each optimization stage
for all three \mlkem{} parameter sets. Each group of bars represents one
operation; the three bars within a group show the total speedup achieved after
applying (i)~O3 without auto-vec (\varrefnv{}), (ii)~O3 with auto-vec
(\varref{}), and (iii)~hand-written AVX2 (\varavx{})---all normalized to the
unoptimized \varrefo{} baseline. The log scale makes the three orders of
magnitude of variation legible.
Several structural features are immediately apparent:
\begin{itemize}
\item The \varrefnv{} and \varref{} bars are nearly indistinguishable for
arithmetic operations (NTT, INVNTT, basemul, frommsg), confirming that
GCC's auto-vectorizer contributes negligibly to these operations.
\item The \varavx{} bars are 1--2 orders of magnitude taller than the
\varref{} bars for arithmetic operations, indicating that hand-written
SIMD dominates the speedup.
\item For SHAKE-heavy operations (gen\_a, noise), all three bars are much
closer together, reflecting the memory-bandwidth bottleneck that limits
SIMD benefit.
\end{itemize}
\begin{figure*}[t]
\centering
\input{figures/fig_decomp}
\caption{Cumulative speedup at each optimization stage, normalized to
\varrefo{} (1×). Three bars per operation:
\textcolor{colRefnv}{$\blacksquare$}~O3 no auto-vec,
\textcolor{colRef}{$\blacksquare$}~O3 + auto-vec,
\textcolor{colAvx}{$\blacksquare$}~O3 + hand SIMD (AVX2).
Log $y$-axis; 95\% bootstrap CI shown on \varavx{} bars.
Sorted by \varavx{} speedup.}
\label{fig:decomp}
\end{figure*}
\subsection{Hand-Written SIMD Speedup}
\label{sec:results:simd}
Figure~\ref{fig:handsimd} isolates the hand-written SIMD speedup (\varref{}
$\to$ \varavx{}) across all three \mlkem{} parameter sets. Table~\ref{tab:simd}
summarizes the numerical values.
Key observations:
\begin{itemize}
\item \textbf{Arithmetic operations} achieve the largest speedups:
\speedup{56.3} for \op{INVNTT} at \mlkemk{512}, \speedup{52.0} for
\op{basemul}, and \speedup{45.6} for \op{frommsg}. The 95\% bootstrap
CIs on these ratios are extremely tight (often $[\hat{s}, \hat{s}]$ to
two decimal places), reflecting near-perfect measurement stability.
\item \textbf{gen\_a} achieves \speedup{3.8}--\speedup{4.7}: substantially
smaller than arithmetic operations because SHAKE-128 generation is
memory-bandwidth limited.
\item \textbf{Noise sampling} achieves only \speedup{1.2}--\speedup{1.4},
the smallest SIMD benefit. The centered binomial distribution (CBD)
sampler is bit-manipulation-heavy with sequential bitstream reads that
do not parallelize well.
\item Speedups are broadly consistent across parameter sets for per-polynomial
operations, as expected (§\ref{sec:results:crossparams}).
\end{itemize}
\begin{figure*}[t]
\centering
\input{figures/fig_hand_simd}
\caption{Hand-written SIMD speedup (\varref{} $\to$ \varavx{}) per operation,
across all three \mlkem{} parameter sets. Log $y$-axis.
95\% bootstrap CI error bars (often sub-pixel).
Sorted by \mlkemk{512} speedup.}
\label{fig:handsimd}
\end{figure*}
\begin{table}[t]
\caption{Hand-written SIMD speedup (\varref{} $\to$ \varavx{}), median ratio
with 95\% bootstrap CI. All Cliff's $\delta = +1.000$, $p < 10^{-300}$.}
\label{tab:simd}
\small
\begin{tabular}{lccc}
\toprule
Operation & \mlkemk{512} & \mlkemk{768} & \mlkemk{1024} \\
\midrule
\op{INVNTT} & $56.3\times$ & $52.2\times$ & $50.5\times$ \\
\op{basemul} & $52.0\times$ & $47.6\times$ & $41.6\times$ \\
\op{frommsg} & $45.6\times$ & $49.2\times$ & $55.4\times$ \\
\op{NTT} & $35.5\times$ & $39.4\times$ & $34.6\times$ \\
\op{iDec} & $35.1\times$ & $35.0\times$ & $31.1\times$ \\
\op{iEnc} & $10.0\times$ & $9.4\times$ & $9.4\times$ \\
\op{iKeypair}& $8.3\times$ & $7.6\times$ & $8.1\times$ \\
\op{gen\_a} & $4.7\times$ & $3.8\times$ & $4.8\times$ \\
\op{noise} & $1.4\times$ & $1.4\times$ & $1.2\times$ \\
\bottomrule
\end{tabular}
\end{table}
\subsection{Statistical Significance}
\label{sec:results:stats}
All \varref{} vs.\ \varavx{} comparisons pass the Mann-Whitney U test at
$p < 10^{-300}$. Cliff's $\delta = +1.000$ for all operations except
\op{NTT} at \mlkemk{512} and \mlkemk{1024} ($\delta = +0.999$), meaning AVX2
achieves a strictly smaller cycle count than \varref{} in effectively every
observation pair.
Figure~\ref{fig:cliffs} shows the heatmap of Cliff's $\delta$ values across
all operations and parameter sets.
\begin{figure}[t]
\centering
\includegraphics[width=\columnwidth]{figures/cliffs_delta_heatmap.pdf}
\caption{Cliff's $\delta$ (\varref{} vs.\ \varavx{}) for all operations and
parameter sets. $\delta = +1$: AVX2 is faster in every observation
pair. Nearly all cells are at $+1.000$.}
\label{fig:cliffs}
\end{figure}
\subsection{Cross-Parameter Consistency}
\label{sec:results:crossparams}
Figure~\ref{fig:crossparams} shows the \varavx{} speedup for the four
per-polynomial operations across \mlkemk{512}, \mlkemk{768}, and
\mlkemk{1024}. Since all three instantiations operate on 256-coefficient
polynomials, speedups for \op{frommsg} and \op{INVNTT} should be
parameter-independent. This holds approximately: \op{frommsg} varies by only
$\pm 10\%$, \op{INVNTT} by $\pm 6\%$.
\op{NTT} shows a more pronounced variation ($35.5\times$ at \mlkemk{512},
$39.4\times$ at \mlkemk{768}, $34.6\times$ at \mlkemk{1024}) that is
statistically real (non-overlapping 95\% CIs). We attribute this to
\emph{cache state effects}: the surrounding polyvec loops that precede each
NTT call have a footprint that varies with $k$, leaving different cache
residency patterns that affect NTT latency in the scalar \varref{} path.
The AVX2 path is less sensitive because its smaller register footprint keeps
more state in vector registers.
\begin{figure}[t]
\centering
\input{figures/fig_cross_param}
\caption{Per-polynomial operation speedup (\varref{} $\to$ \varavx{}) across
security parameters. Polynomial dimension is 256 for all; variation
reflects cache-state differences in the calling context.}
\label{fig:crossparams}
\end{figure}
\subsection{Hardware Counter Breakdown}
\label{sec:results:papi}
\phasetwo{IPC, L1/L2/L3 cache miss rates, branch mispredictions via PAPI.
This section will contain bar charts of per-counter values comparing ref and
avx2 for each operation, explaining the mechanistic origins of the speedup.}
\subsection{Energy Efficiency}
\label{sec:results:energy}
\phasetwo{Intel RAPL pkg + DRAM energy readings per operation.
EDP (energy-delay product) comparison. Energy per KEM operation.}

View File

@ -0,0 +1,31 @@
% ── Supplementary: KEM-level end-to-end speedup ───────────────────────────────
\section{End-to-End KEM Speedup}
\label{sec:supp:kem}
Figure~\ref{fig:kemlevel} shows the hand-written SIMD speedup for the
top-level KEM operations: key generation (\op{kyber\_keypair}), encapsulation
(\op{kyber\_encaps}), and decapsulation (\op{kyber\_decaps}). These composite
operations aggregate the speedups of their constituent primitives, weighted by
relative cycle counts.
Decapsulation achieves the highest speedup (\speedup{6.9}--\speedup{7.1})
because it involves the largest share of arithmetic operations (two additional
NTT and INVNTT calls for re-encryption verification). Key generation achieves
the lowest (\speedup{5.3}--\speedup{5.9}) because it involves one fewer
polynomial multiplication step than encapsulation.
\begin{figure}[h]
\centering
\input{figures/fig_kem_level}
\caption{End-to-end KEM speedup (\varref{} $\to$ \varavx{}) for
\op{kyber\_keypair}, \op{kyber\_encaps}, and \op{kyber\_decaps}.
Intel Xeon Platinum 8268; 95\% bootstrap CI.}
\label{fig:kemlevel}
\end{figure}
\section{Full Operation Set}
\label{sec:supp:fullops}
\todo[inline]{Full operation speedup table for all 20 benchmarked operations,
including \op{poly\_compress}, \op{poly\_decompress}, \op{polyvec\_compress},
\op{poly\_tomsg}, and the \texttt{*\_derand} KEM variants.}

49
slurm/build.sh Executable file
View File

@ -0,0 +1,49 @@
#!/bin/bash
# Build all benchmark binaries on the HPC login node.
#
# Usage: bash slurm/build.sh [--papi] [--rapl]
#
# Run this once after rsyncing, before submitting jobs.
# Binaries are written to harness/build-hpc/.
set -euo pipefail

REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
BUILD_DIR="${REPO_ROOT}/harness/build-hpc"

# Optional instrumentation toggles; both stay OFF unless requested on the
# command line.
WITH_PAPI=OFF
WITH_RAPL=OFF
while [[ $# -gt 0 ]]; do
    case "$1" in
        --papi) WITH_PAPI=ON ;;
        --rapl) WITH_RAPL=ON ;;
        *) echo "unknown flag: $1" >&2; exit 1 ;;
    esac
    shift
done

# Banner: record the effective configuration for the build log.
echo "=== pqc-bench build ==="
echo "REPO_ROOT : $REPO_ROOT"
echo "BUILD_DIR : $BUILD_DIR"
echo "WITH_PAPI : $WITH_PAPI"
echo "WITH_RAPL : $WITH_RAPL"
echo "CC : ${CC:-default}"
echo "DATE : $(date -Iseconds)"

# The kyber submodule must be checked out before CMake can find its sources;
# probe for one known file rather than trusting the directory to exist.
KYBER_PROBE="${REPO_ROOT}/algorithms/kyber/ref/kem.c"
if [[ ! -f "$KYBER_PROBE" ]]; then
    echo "Populating git submodules..."
    git -C "$REPO_ROOT" submodule update --init --recursive
fi

# Configure and build every variant in a single Release tree.
cmake \
    -B "$BUILD_DIR" \
    -S "${REPO_ROOT}/harness" \
    -DCMAKE_BUILD_TYPE=Release \
    -DWITH_PAPI="${WITH_PAPI}" \
    -DWITH_RAPL="${WITH_RAPL}"
cmake --build "$BUILD_DIR" --parallel

echo ""
echo "Built binaries:"
# `|| echo` keeps set -e from aborting when no binaries matched the glob.
ls -lh "${BUILD_DIR}"/bench_mlkem* 2>/dev/null || echo "(none found)"

85
slurm/submit.sh Executable file
View File

@ -0,0 +1,85 @@
#!/bin/bash
# Instantiate and submit SLURM benchmark jobs.
#
# Usage: bash slurm/submit.sh [--papi] [--nspins N] [--params LIST] [--variants LIST] [--node NODE]
#
# Examples:
#   bash slurm/submit.sh
#   bash slurm/submit.sh --papi --nspins 500
#   bash slurm/submit.sh --variants "ref avx2" --params "512 1024"
#   bash slurm/submit.sh --node node2334   # pin all jobs to a specific node
set -euo pipefail
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
TEMPLATE="${REPO_ROOT}/slurm/templates/bench_mlkem.sh.tmpl"
# ── Defaults ─────────────────────────────────────────────────────────────────
NSPINS=1000
WITH_PAPI=OFF
PARAMS="512 768 1024"
VARIANTS="ref avx2 refnv refo0"
BENCH_NODE=""
# Fail with a clear message when a value-taking flag is the last argument;
# otherwise `shift` + `$1` would die with an opaque set -u "unbound variable".
require_value() {
    if [[ $# -lt 2 ]]; then
        echo "flag $1 requires a value" >&2
        exit 1
    fi
}
while [[ $# -gt 0 ]]; do
    case "$1" in
        --papi) WITH_PAPI=ON ;;
        --nspins)   require_value "$@"; shift; NSPINS="$1" ;;
        --params)   require_value "$@"; shift; PARAMS="$1" ;;
        --variants) require_value "$@"; shift; VARIANTS="$1" ;;
        --node)     require_value "$@"; shift; BENCH_NODE="$1" ;;
        *) echo "unknown flag: $1" >&2; exit 1 ;;
    esac
    shift
done
# Build directory created by build.sh.
BUILD_DIR="${REPO_ROOT}/harness/build-hpc"
if [[ ! -d "$BUILD_DIR" ]]; then
    echo "ERROR: $BUILD_DIR not found — run slurm/build.sh first" >&2
    exit 1
fi
echo "=== pqc-bench submit ==="
echo "NSPINS : $NSPINS"
echo "WITH_PAPI: $WITH_PAPI"
echo "PARAMS : $PARAMS"
echo "VARIANTS : $VARIANTS"
echo "NODE : ${BENCH_NODE:-any}"
echo ""
JOBS_SUBMITTED=0
for PARAM in $PARAMS; do
    for VARIANT in $VARIANTS; do
        BINARY="${BUILD_DIR}/bench_mlkem${PARAM}_${VARIANT}"
        if [[ ! -x "$BINARY" ]]; then
            echo "SKIP bench_mlkem${PARAM}_${VARIANT} — binary not found"
            continue
        fi
        # Output goes into data/raw/kyber/mlkem{PARAM}/{VARIANT}/ so the aggregation
        # tool infers algorithm and variant from the directory structure.
        OUTPUT_DIR="${REPO_ROOT}/data/raw/kyber/mlkem${PARAM}/${VARIANT}"
        mkdir -p "$OUTPUT_DIR"
        # Instantiate template. The instantiated script is deliberately left in
        # /tmp after submission so failed jobs can be inspected and re-run.
        JOB_SCRIPT="$(mktemp /tmp/bench_mlkem${PARAM}_${VARIANT}.XXXXXX.sh)"
        export PARAM VARIANT NSPINS BUILD_DIR OUTPUT_DIR WITH_PAPI BENCH_NODE
        envsubst '${PARAM} ${VARIANT} ${NSPINS} ${BUILD_DIR} ${OUTPUT_DIR} ${WITH_PAPI} ${BENCH_NODE}' \
            < "$TEMPLATE" > "$JOB_SCRIPT"
        chmod +x "$JOB_SCRIPT"
        # Collect sbatch arguments in an array: unlike an unquoted string,
        # array expansion "${SBATCH_ARGS[@]}" never re-splits or globs values.
        SBATCH_ARGS=(--parsable)
        if [[ -n "$BENCH_NODE" ]]; then
            SBATCH_ARGS+=(--nodelist="$BENCH_NODE")
        fi
        JOB_ID=$(sbatch "${SBATCH_ARGS[@]}" "$JOB_SCRIPT")
        echo "SUBMIT bench_mlkem${PARAM}_${VARIANT} job=${JOB_ID} out=${OUTPUT_DIR}/${JOB_ID}.out"
        JOBS_SUBMITTED=$((JOBS_SUBMITTED + 1))
    done
done
echo ""
echo "Submitted $JOBS_SUBMITTED jobs."

View File

@ -1,38 +1,48 @@
#!/bin/bash
# Template SLURM job for ML-KEM benchmarking.
# Variables filled in by slurm/submit.sh:
# PARAM — 512 | 768 | 1024
# VARIANT — ref | refnv | avx2 | ...
# NTESTS — iterations per operation (default 10000)
# BINARY — path to compiled benchmark binary
# SLURM job template for ML-KEM benchmarking.
# Instantiated by slurm/submit.sh — do not submit directly.
#
# Template variables (filled by envsubst in submit.sh):
# PARAM — 512 | 768 | 1024
# VARIANT — ref | avx2 | refnv | refo0
# NSPINS — outer loop iterations (default 1000)
# BUILD_DIR — path to directory containing the benchmark binaries
# OUTPUT_DIR — directory where this job's .out file is written
#SBATCH -J bench_mlkem${PARAM}_${VARIANT}
#SBATCH -p batch
#SBATCH -n 1
#SBATCH --mem=2G
#SBATCH -t 02:00:00
#SBATCH --constraint=intel
#SBATCH -o %j_mlkem${PARAM}_${VARIANT}.out
#SBATCH -c 1
#SBATCH --mem=256M
#SBATCH -t 00:45:00
#SBATCH -o ${OUTPUT_DIR}/%j.out
# Pin to a single core, disable frequency scaling for deterministic measurements.
# Requires appropriate OSCAR allocation; skip if unavailable.
export GOMP_CPU_AFFINITY="0"
# ── Environment ──────────────────────────────────────────────────────────────
# Pin to a single logical core for deterministic measurements.
taskset -cp 0 $$ 2>/dev/null || true
NTESTS=${NTESTS:-10000}
BINARY=${BINARY:-./bench_mlkem${PARAM}_${VARIANT}}
# Disable CPU frequency scaling if we have permission; ignore otherwise.
cpupower frequency-set -g performance 2>/dev/null || true
# ── Metadata (parsed by analysis/pkg/parse) ──────────────────────────────────
# These ## lines are picked up by the parser alongside the OSCAR prolog lines.
echo "## BENCH_VARIANT : ${VARIANT}"
echo "## BENCH_PARAM : ${PARAM}"
echo "## BENCH_NSPINS : ${NSPINS}"
echo "## BENCH_NODE_REQ : ${BENCH_NODE}"
echo "## BENCH_BINARY : ${BUILD_DIR}/bench_mlkem${PARAM}_${VARIANT}"
echo "## BENCH_DATE : $(date -Iseconds)"
echo "## CPU_MODEL : $(grep 'model name' /proc/cpuinfo | head -1 | cut -d: -f2 | xargs)"
echo "## PERF_PARANOID : $(cat /proc/sys/kernel/perf_event_paranoid 2>/dev/null || echo unknown)"
echo "## PAPI_BUILD : ${WITH_PAPI:-OFF}"
BINARY="${BUILD_DIR}/bench_mlkem${PARAM}_${VARIANT}"
NSPINS="${NSPINS:-1000}"
if [[ ! -x "$BINARY" ]]; then
echo "ERROR: binary not found or not executable: $BINARY" >&2
exit 1
fi
echo "=== bench_mlkem${PARAM}_${VARIANT} ==="
echo "SLURM_JOB_ID: $SLURM_JOB_ID"
echo "SLURM_NODELIST: $SLURM_NODELIST"
echo "NTESTS: $NTESTS"
echo "DATE: $(date -Iseconds)"
echo "UNAME: $(uname -a)"
echo "CPU: $(grep 'model name' /proc/cpuinfo | head -1 | cut -d: -f2 | xargs)"
echo "---"
"$BINARY" "$NTESTS"
# ── Run ───────────────────────────────────────────────────────────────────────
"$BINARY" "$NSPINS"