This commit is contained in:
Levi Neuwirth 2026-04-05 11:23:21 -04:00
parent 7750ae3d8c
commit 00ced380f9
92 changed files with 2681844 additions and 260 deletions

3
.gitmodules vendored Normal file
View File

@ -0,0 +1,3 @@
[submodule "algorithms/kyber"]
path = algorithms/kyber
url = https://github.com/pq-crystals/kyber

1
algorithms/kyber Submodule

@ -0,0 +1 @@
Subproject commit 4768bd37c02f9c40a46cb49d4d1f4d5e612bb882

286
analysis/analyze.py Normal file
View File

@ -0,0 +1,286 @@
#!/usr/bin/env python3
"""Statistical analysis of pqc-bench results.
Parses .out files via the Go aggregator, then computes a three-way
decomposition of where speedup originates:
refo0 refnv compiler optimisation (O3, no vectorisation)
refnv ref compiler auto-vectorisation
ref avx2 hand-written SIMD
Usage:
# Run aggregator inline:
python3 analysis/analyze.py --data data/raw/kyber
# Or pre-generate the raw JSON once, then reuse it:
go run ./analysis/cmd/aggregate --raw --out /tmp/bench.json data/raw/kyber
python3 analysis/analyze.py --json /tmp/bench.json
# Write JSON output for figure generation:
python3 analysis/analyze.py --data data/raw/kyber --out analysis/results.json
"""
import argparse
import json
import subprocess
import sys
from pathlib import Path
import numpy as np
from scipy import stats as scipy_stats
# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------
# Repository root, resolved from this file's location (analysis/analyze.py →
# parent of the analysis/ directory). Used as cwd for the Go aggregator.
REPO_ROOT = Path(__file__).resolve().parent.parent
def load_json(path: str) -> list[dict]:
    """Read a pre-generated aggregate JSON file and return its records."""
    with open(path) as fh:
        return json.load(fh)
def run_aggregator(data_dir: str) -> list[dict]:
    """Invoke the Go aggregator on *data_dir* and return its parsed records.

    Prints the aggregator's stderr and exits with status 1 on failure.
    """
    proc = subprocess.run(
        ["go", "run", "./cmd/aggregate", "--raw", data_dir],
        capture_output=True,
        text=True,
        cwd=REPO_ROOT / "analysis",
    )
    if proc.returncode != 0:
        print(f"aggregator failed:\n{proc.stderr}", file=sys.stderr)
        sys.exit(1)
    return json.loads(proc.stdout)
# ---------------------------------------------------------------------------
# Statistics
# ---------------------------------------------------------------------------
def cliffs_delta_from_u(u: float, m: int, n: int) -> float:
    """Cliff's delta derived from a Mann-Whitney U statistic.

    Here U counts pairs (faster_i, baseline_j) with faster_i < baseline_j.
    delta = (2U - m*n) / (m*n), in [-1, +1]; positive means the *faster*
    sample dominates the baseline.
    """
    pairs = m * n
    return (2.0 * u - pairs) / pairs
def bootstrap_speedup_ci(
baseline: np.ndarray,
faster: np.ndarray,
n_boot: int = 5_000,
ci: float = 0.95,
rng: np.random.Generator | None = None,
) -> tuple[float, float]:
"""95% bootstrap CI for speedup = median(baseline) / median(faster).
Resamples both arrays independently using vectorised indexing; returns (lo, hi).
"""
if rng is None:
rng = np.random.default_rng(42)
m, n = len(baseline), len(faster)
# Draw all indices at once: shape (n_boot, m) and (n_boot, n)
bi = rng.integers(0, m, size=(n_boot, m))
fi = rng.integers(0, n, size=(n_boot, n))
b_samples = baseline[bi] # (n_boot, m)
f_samples = faster[fi] # (n_boot, n)
# Median along axis=1 for each bootstrap replicate
ratios = np.median(b_samples, axis=1) / np.median(f_samples, axis=1)
alpha = (1 - ci) / 2
return float(np.percentile(ratios, alpha * 100)), float(np.percentile(ratios, (1 - alpha) * 100))
def compare(baseline: np.ndarray, faster: np.ndarray, rng: np.random.Generator) -> dict:
    """Full pairwise comparison: speedup + CI + Mann-Whitney + Cliff's delta.

    Returns a dict with the median speedup, its bootstrap CI, the one-sided
    Mann-Whitney p-value (H1: faster < baseline), and Cliff's delta oriented
    so that positive values mean *faster* dominates *baseline*.
    """
    speedup = float(np.median(baseline)) / float(np.median(faster))
    ci_lo, ci_hi = bootstrap_speedup_ci(baseline, faster, rng=rng)
    # One-sided Mann-Whitney: is faster < baseline in cycle counts?
    m, n = len(faster), len(baseline)
    u_stat, p_val = scipy_stats.mannwhitneyu(faster, baseline, alternative="less")
    # BUG FIX: scipy returns U1 = #pairs where faster_i > baseline_j (ties
    # count 0.5), but cliffs_delta_from_u expects #pairs where
    # faster_i < baseline_j. Convert via U' = m*n - U1 so a dominant faster
    # sample yields delta = +1 (matching the figure/heatmap convention),
    # not -1 as the raw U1 would.
    delta = cliffs_delta_from_u(m * n - float(u_stat), m, n)
    return {
        "speedup": speedup,
        "ci95": [ci_lo, ci_hi],
        "mannwhitney_p": float(p_val),
        "cliffs_delta": delta,
        "n_baseline": n,
        "n_faster": m,
    }
# ---------------------------------------------------------------------------
# Analysis
# ---------------------------------------------------------------------------
# Build variants ordered least → most optimised (see module docstring):
#   refo0  baseline (-O0 per figure labels)
#   refnv  O3 with auto-vectorisation disabled
#   ref    O3 with auto-vectorisation
#   avx2   hand-written SIMD
VARIANTS = ("refo0", "refnv", "ref", "avx2")
# Canonical operation order for display
OP_ORDER = [
    "NTT", "INVNTT", "basemul", "frommsg",
    "gen_a", "poly_getnoise_eta1", "poly_getnoise_eta2",
    "keygen", "enc", "dec",
]
def analyze(records: list[dict]) -> list[dict]:
    """Compute all pairwise variant comparisons per (algorithm, operation).

    Only records carrying per-observation ``raw`` samples are used; groups
    with fewer than two variants present are skipped. A single seeded RNG
    is threaded through every compare() call, so the bootstrap draws — and
    therefore the output — are fully reproducible, but also order-dependent:
    do not reorder the comparison calls below.
    """
    # Build lookup: (algorithm, variant, operation) → raw array
    raw: dict[tuple[str, str, str], np.ndarray] = {}
    for r in records:
        if r.get("raw"):
            raw[(r["algorithm"], r["variant"], r["operation"])] = np.array(r["raw"], dtype=np.float64)
    # Collect all (algorithm, operation) pairs present across all variants
    alg_ops = sorted(
        {(alg, op) for alg, var, op in raw},
        key=lambda x: (x[0], _op_rank(x[1])),
    )
    rng = np.random.default_rng(42)
    results = []
    for alg, op in alg_ops:
        arrays = {v: raw[(alg, v, op)] for v in VARIANTS if (alg, v, op) in raw}
        if len(arrays) < 2:
            continue
        row: dict = {
            "algorithm": alg,
            "operation": op,
            "medians": {v: float(np.median(a)) for v, a in arrays.items()},
            "n_obs": {v: len(a) for v, a in arrays.items()},
            "comparisons": {},
        }
        comps = row["comparisons"]
        # Three-way decomposition (each step requires both variants present)
        if "refo0" in arrays and "refnv" in arrays:
            comps["refo0_to_refnv"] = compare(arrays["refo0"], arrays["refnv"], rng)
        if "refnv" in arrays and "ref" in arrays:
            comps["refnv_to_ref"] = compare(arrays["refnv"], arrays["ref"], rng)
        if "ref" in arrays and "avx2" in arrays:
            comps["ref_to_avx2"] = compare(arrays["ref"], arrays["avx2"], rng)
        # Totals
        if "refo0" in arrays and "ref" in arrays:
            comps["refo0_to_ref"] = compare(arrays["refo0"], arrays["ref"], rng)
        if "refo0" in arrays and "avx2" in arrays:
            comps["refo0_to_avx2"] = compare(arrays["refo0"], arrays["avx2"], rng)
        results.append(row)
    return results
def _op_rank(op: str) -> int:
    """Position of *op* in OP_ORDER; unknown operations sort last."""
    return OP_ORDER.index(op) if op in OP_ORDER else len(OP_ORDER)
# ---------------------------------------------------------------------------
# Display
# ---------------------------------------------------------------------------
def _fmt_speedup(comp: dict | None) -> str:
if comp is None:
return ""
r = comp["speedup"]
lo, hi = comp["ci95"]
return f"{r:5.2f}x [{lo:.2f},{hi:.2f}]"
def _fmt_delta(comp: dict | None) -> str:
if comp is None:
return ""
return f"{comp['cliffs_delta']:+.3f}"
def _fmt_p(comp: dict | None) -> str:
if comp is None:
return ""
p = comp["mannwhitney_p"]
if p < 1e-300:
return " <1e-300"
if p < 1e-10:
return f" {p:.1e}"
return f" {p:.4f}"
def print_table(results: list[dict]) -> None:
    """Print a per-algorithm comparison table to stdout.

    Rows within each algorithm are sorted by descending ref→avx2 speedup;
    the Cliff δ and p-value columns refer to the ref→avx2 comparison.
    """
    algs = sorted({r["algorithm"] for r in results})
    for alg in algs:
        rows = [r for r in results if r["algorithm"] == alg]
        rows.sort(key=lambda r: -r["comparisons"].get("ref_to_avx2", {}).get("speedup", 0))
        # FIX: the previous f"{'' * 110}" multiplied an *empty* string and
        # printed nothing; use the same '─' glyph as the column underlines
        # below so section separators are actually visible.
        print(f"\n{'─' * 110}")
        print(f" {alg.upper()}")
        print(f"{'─' * 110}")
        print(
            f" {'Operation':<24}"
            f" {'O3 (no-vec)':>18}"  # refo0→refnv
            f" {'Auto-vec':>18}"  # refnv→ref
            f" {'Hand SIMD':>18}"  # ref→avx2
            f" {'Total':>18}"  # refo0→avx2
            f" {'Cliff δ':>7}"
            f" {'p-value':>9}"
        )
        print(f" {'':─<24} {'':─<18} {'':─<18} {'':─<18} {'':─<18} {'':─<7} {'':─<9}")
        for r in rows:
            c = r["comparisons"]
            print(
                f" {r['operation']:<24}"
                f" {_fmt_speedup(c.get('refo0_to_refnv')):>18}"
                f" {_fmt_speedup(c.get('refnv_to_ref')):>18}"
                f" {_fmt_speedup(c.get('ref_to_avx2')):>18}"
                f" {_fmt_speedup(c.get('refo0_to_avx2')):>18}"
                f" {_fmt_delta(c.get('ref_to_avx2')):>7}"
                f" {_fmt_p(c.get('ref_to_avx2')):>9}"
            )
    # Footer legend (printed once, after all algorithms).
    print(f"\n{'─' * 110}")
    print(" Speedup = median(baseline) / median(variant); CI: 95% bootstrap (5000 iterations)")
    print(" Cliff δ and p-value are for ref → avx2 comparison (H1: avx2 cycles < ref cycles)")
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main() -> None:
    """CLI entry point: load records, analyze, print table, optionally dump JSON.

    Exactly one of --data (runs the Go aggregator) or --json (pre-generated
    aggregate output) is required; --out additionally writes the analysis
    results as JSON for figure generation.
    """
    parser = argparse.ArgumentParser(description="Statistical analysis of pqc-bench results")
    src = parser.add_mutually_exclusive_group(required=True)
    src.add_argument("--data", metavar="DIR", help="data directory (runs Go aggregator)")
    src.add_argument("--json", metavar="FILE", help="pre-generated aggregate JSON with --raw")
    parser.add_argument("--out", metavar="FILE", help="write analysis JSON to this file")
    args = parser.parse_args()
    if args.json:
        records = load_json(args.json)
        print(f"Loaded {len(records)} groups from {args.json}.", file=sys.stderr)
    else:
        print("Running aggregator...", file=sys.stderr)
        records = run_aggregator(args.data)
        print(f"Loaded {len(records)} groups.", file=sys.stderr)
    results = analyze(records)
    print_table(results)
    if args.out:
        with open(args.out, "w") as f:
            json.dump(results, f, indent=2)
        print(f"\nWrote analysis JSON to {args.out}", file=sys.stderr)
if __name__ == "__main__":
    main()

View File

@ -0,0 +1,215 @@
// aggregate parses pqc-bench .out files and emits summary statistics as JSON.
//
// Usage:
//
// aggregate [--raw] [--out results.json] <data-dir>
//
// It walks <data-dir> for all *.out files, grouping results by the parent
// directory name (algorithm) and the variant inferred from the SLURM header.
// Output is a JSON array of result objects, one per (algorithm, variant,
// operation) triple.
package main
import (
"encoding/json"
"flag"
"fmt"
"io/fs"
"os"
"path/filepath"
"slices"
"strings"
"git.levineuwirth.org/neuwirth/where-simd-helps/analysis/pkg/parse"
"git.levineuwirth.org/neuwirth/where-simd-helps/analysis/pkg/stats"
)
// Result is one output record: all statistics for a single
// (algorithm, variant, operation) group.
type Result struct {
	Algorithm     string     `json:"algorithm"`
	Variant       string     `json:"variant"`
	Operation     string     `json:"operation"`
	Unit          string     `json:"unit"` // always "cycles" (set at build time below)
	NObservations int        `json:"n_observations"`
	NRuns         int        `json:"n_runs"` // number of source .out files contributing
	Median        float64    `json:"median"`
	Mean          float64    `json:"mean"`
	Std           float64    `json:"std"`
	MAD           float64    `json:"mad"`
	P5            float64    `json:"p5"`
	P25           float64    `json:"p25"`
	P75           float64    `json:"p75"`
	P95           float64    `json:"p95"`
	P99           float64    `json:"p99"`
	CI95          [2]float64 `json:"ci95"`
	Node          string     `json:"node"`
	Sources       []string   `json:"sources"`
	// Raw holds the per-observation values; populated only under --raw.
	Raw []int64 `json:"raw,omitempty"`
}

// groupKey uniquely identifies a (algorithm, variant, operation) combination.
type groupKey struct {
	algorithm, variant, operation string
}
func main() {
	rawFlag := flag.Bool("raw", false, "include per-observation cycle counts in output")
	outFlag := flag.String("out", "", "write JSON output to this file instead of stdout")
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: aggregate [--raw] [--out FILE] <data-dir>\n")
		flag.PrintDefaults()
	}
	flag.Parse()
	if flag.NArg() != 1 {
		flag.Usage()
		os.Exit(1)
	}
	dataDir := flag.Arg(0)
	// Collect all .out files.
	var outFiles []string
	err := filepath.WalkDir(dataDir, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if !d.IsDir() && strings.HasSuffix(path, ".out") {
			outFiles = append(outFiles, path)
		}
		return nil
	})
	if err != nil {
		fmt.Fprintf(os.Stderr, "error walking %s: %v\n", dataDir, err)
		os.Exit(1)
	}
	if len(outFiles) == 0 {
		fmt.Fprintf(os.Stderr, "no .out files found under %s\n", dataDir)
		os.Exit(1)
	}
	// Parse every file and accumulate observations per group.
	// Unparseable files are skipped with a warning rather than aborting.
	type accumulator struct {
		values  []int64
		sources []string
		node    string // node of the first file seen for this group
	}
	groups := make(map[groupKey]*accumulator)
	for _, path := range outFiles {
		run, err := parse.ParseFile(path)
		if err != nil {
			fmt.Fprintf(os.Stderr, "warning: skipping %s: %v\n", path, err)
			continue
		}
		algorithm := inferAlgorithm(run.Meta, path)
		variant := parse.InferVariant(run.Meta)
		for _, spin := range run.Spins {
			for op, m := range spin {
				key := groupKey{algorithm, variant, op}
				acc := groups[key]
				if acc == nil {
					acc = &accumulator{node: run.Meta.Node}
					groups[key] = acc
				}
				// One observation per spin: the spin's median for this op.
				acc.values = append(acc.values, m.Median)
			}
		}
		// Record sources per group (any key with this algorithm+variant).
		// NOTE: scans all groups once per file — O(files × groups), fine at
		// this scale.
		for key, acc := range groups {
			if key.algorithm == algorithm && key.variant == variant {
				if !slices.Contains(acc.sources, path) {
					acc.sources = append(acc.sources, path)
				}
			}
		}
	}
	// Build results. Compute stats on a sorted copy so acc.values keeps its
	// original observation order for the --raw output.
	results := make([]Result, 0, len(groups))
	for key, acc := range groups {
		sorted := make([]int64, len(acc.values))
		copy(sorted, acc.values)
		stats.SortInt64(sorted)
		s := stats.Compute(sorted)
		r := Result{
			Algorithm:     key.algorithm,
			Variant:       key.variant,
			Operation:     key.operation,
			Unit:          "cycles",
			NObservations: s.N,
			NRuns:         len(acc.sources),
			Median:        s.Median,
			Mean:          s.Mean,
			Std:           s.Std,
			MAD:           s.MAD,
			P5:            s.P5,
			P25:           s.P25,
			P75:           s.P75,
			P95:           s.P95,
			P99:           s.P99,
			CI95:          s.CI95,
			Node:          acc.node,
			Sources:       acc.sources,
		}
		if *rawFlag {
			r.Raw = acc.values
		}
		results = append(results, r)
	}
	// Sort for stable output: algorithm → variant → operation.
	slices.SortFunc(results, func(a, b Result) int {
		if a.Algorithm != b.Algorithm {
			return strings.Compare(a.Algorithm, b.Algorithm)
		}
		if a.Variant != b.Variant {
			return strings.Compare(a.Variant, b.Variant)
		}
		return strings.Compare(a.Operation, b.Operation)
	})
	out, err := json.MarshalIndent(results, "", "  ")
	if err != nil {
		fmt.Fprintf(os.Stderr, "error marshalling JSON: %v\n", err)
		os.Exit(1)
	}
	if *outFlag != "" {
		if err := os.WriteFile(*outFlag, out, 0o644); err != nil {
			fmt.Fprintf(os.Stderr, "error writing %s: %v\n", *outFlag, err)
			os.Exit(1)
		}
		fmt.Fprintf(os.Stderr, "wrote %d results to %s\n", len(results), *outFlag)
	} else {
		fmt.Println(string(out))
	}
}
// inferAlgorithm returns the algorithm name (e.g. "mlkem512") for a run.
//
// Priority:
//  1. BENCH_PARAM metadata → "mlkem{PARAM}" (new-style runs via submit.sh)
//  2. Walk the file path upward for a segment matching "mlkem\d+" (handles
//     both flat old-style layout and new nested layout transparently)
//  3. The immediate parent directory name as a last resort.
func inferAlgorithm(meta parse.Meta, filePath string) string {
	if meta.BenchParam != "" {
		return "mlkem" + meta.BenchParam
	}
	// Walk path components looking for mlkem\d+. The previous prefix-only
	// check also matched segments like "mlkem-notes"; require a digits-only
	// suffix to honour the documented pattern.
	dir := filepath.Dir(filePath)
	for dir != "." && dir != "/" {
		base := filepath.Base(dir)
		if isMlkemSegment(base) {
			return base
		}
		dir = filepath.Dir(dir)
	}
	return filepath.Base(filepath.Dir(filePath))
}

// isMlkemSegment reports whether name matches mlkem\d+ (e.g. "mlkem768").
func isMlkemSegment(name string) bool {
	rest, ok := strings.CutPrefix(name, "mlkem")
	if !ok || rest == "" {
		return false
	}
	for _, r := range rest {
		if r < '0' || r > '9' {
			return false
		}
	}
	return true
}

View File

@ -0,0 +1,242 @@
// analyze-simd computes speedup ratios from aggregated pqc-bench results.
//
// Usage:
//
// analyze-simd [--baseline ref] [--in results.json] [--out speedups.json]
//
// It reads the JSON produced by 'aggregate', computes per-operation speedups
// relative to the baseline variant, and emits both a human-readable table
// and a structured JSON file suitable for downstream plotting.
package main
import (
"cmp"
"encoding/json"
"flag"
"fmt"
"math"
"os"
"slices"
"strings"
"text/tabwriter"
)
// Record mirrors the aggregate output schema (fields we need).
type Record struct {
	Algorithm string     `json:"algorithm"`
	Variant   string     `json:"variant"`
	Operation string     `json:"operation"`
	Median    float64    `json:"median"`
	CI95      [2]float64 `json:"ci95"`
	NRuns     int        `json:"n_runs"`
}

// Speedup is one variant-vs-baseline comparison for a single (algorithm, operation).
type Speedup struct {
	Variant string  `json:"variant"`
	Median  float64 `json:"median"`
	// Speedup = baseline median / variant median (>1 means variant faster).
	Speedup   float64    `json:"speedup"`
	SpeedupCI [2]float64 `json:"speedup_ci95"`
}

// Result is one output row: all comparisons for one (algorithm, operation) pair.
type Result struct {
	Algorithm       string     `json:"algorithm"`
	Operation       string     `json:"operation"`
	BaselineVariant string     `json:"baseline_variant"`
	BaselineMedian  float64    `json:"baseline_median"`
	BaselineCI95    [2]float64 `json:"baseline_ci95"`
	Comparisons     []Speedup  `json:"comparisons"`
}
func main() {
	baseline := flag.String("baseline", "ref", "variant to use as the speedup denominator")
	inFile := flag.String("in", "results/kyber.json", "input JSON from aggregate")
	outFile := flag.String("out", "", "write speedup JSON to this file (default: stdout)")
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: analyze-simd [--baseline VARIANT] [--in FILE] [--out FILE]\n")
		flag.PrintDefaults()
	}
	flag.Parse()
	raw, err := os.ReadFile(*inFile)
	if err != nil {
		fmt.Fprintf(os.Stderr, "error reading %s: %v\n", *inFile, err)
		os.Exit(1)
	}
	var records []Record
	if err := json.Unmarshal(raw, &records); err != nil {
		fmt.Fprintf(os.Stderr, "error parsing JSON: %v\n", err)
		os.Exit(1)
	}
	// Index by (algorithm, variant, operation).
	type key struct{ algorithm, variant, operation string }
	idx := make(map[key]Record, len(records))
	for _, r := range records {
		idx[key{r.Algorithm, r.Variant, r.Operation}] = r
	}
	// Collect sorted unique values for stable output.
	algorithms := unique(records, func(r Record) string { return r.Algorithm })
	operations := unique(records, func(r Record) string { return r.Operation })
	variants := unique(records, func(r Record) string { return r.Variant })
	// Remove baseline from comparison variants.
	variants = slices.DeleteFunc(variants, func(v string) bool { return v == *baseline })
	// Build results. Pairs missing a baseline record (or with a zero
	// baseline median) are skipped entirely.
	var results []Result
	for _, alg := range algorithms {
		for _, op := range operations {
			baseRec, ok := idx[key{alg, *baseline, op}]
			if !ok || baseRec.Median == 0 {
				continue
			}
			res := Result{
				Algorithm:       alg,
				Operation:       op,
				BaselineVariant: *baseline,
				BaselineMedian:  baseRec.Median,
				BaselineCI95:    baseRec.CI95,
			}
			for _, v := range variants {
				cmpRec, ok := idx[key{alg, v, op}]
				if !ok || cmpRec.Median == 0 {
					continue
				}
				sp := baseRec.Median / cmpRec.Median
				// Conservative CI: ratio of interval bounds.
				//   speedup_lo = baseline_lo / cmp_hi
				//   speedup_hi = baseline_hi / cmp_lo
				var spCI [2]float64
				if cmpRec.CI95[1] > 0 {
					spCI[0] = safeDiv(baseRec.CI95[0], cmpRec.CI95[1])
				}
				if cmpRec.CI95[0] > 0 {
					spCI[1] = safeDiv(baseRec.CI95[1], cmpRec.CI95[0])
				}
				res.Comparisons = append(res.Comparisons, Speedup{
					Variant:   v,
					Median:    cmpRec.Median,
					Speedup:   sp,
					SpeedupCI: spCI,
				})
			}
			if len(res.Comparisons) > 0 {
				results = append(results, res)
			}
		}
	}
	// Print human-readable table to stderr (keeps stdout JSON-clean).
	printTable(os.Stderr, results, variants, *baseline)
	// Emit JSON.
	out, err := json.MarshalIndent(results, "", "  ")
	if err != nil {
		fmt.Fprintf(os.Stderr, "error marshalling JSON: %v\n", err)
		os.Exit(1)
	}
	if *outFile != "" {
		if err := os.WriteFile(*outFile, out, 0o644); err != nil {
			fmt.Fprintf(os.Stderr, "error writing %s: %v\n", *outFile, err)
			os.Exit(1)
		}
		fmt.Fprintf(os.Stderr, "wrote %d results to %s\n", len(results), *outFile)
	} else {
		fmt.Println(string(out))
	}
}
// printTable writes a per-algorithm, tab-aligned speedup table to w.
// Rows sort by descending avx2 speedup (ties alphabetically by operation);
// variants with no comparison for a row render as "---".
func printTable(w *os.File, results []Result, variants []string, baseline string) {
	tw := tabwriter.NewWriter(w, 0, 0, 2, ' ', 0)
	// Group by algorithm.
	byAlg := make(map[string][]Result)
	for _, r := range results {
		byAlg[r.Algorithm] = append(byAlg[r.Algorithm], r)
	}
	algs := make([]string, 0, len(byAlg))
	for a := range byAlg {
		algs = append(algs, a)
	}
	slices.Sort(algs)
	for _, alg := range algs {
		fmt.Fprintf(tw, "\n── %s (baseline: %s) ──\n", strings.ToUpper(alg), baseline)
		// Header.
		var hdr strings.Builder
		fmt.Fprintf(&hdr, "%-38s\t%12s", "operation", baseline+"(cycles)")
		for _, v := range variants {
			fmt.Fprintf(&hdr, "\t%10s", v)
		}
		fmt.Fprintln(tw, hdr.String())
		fmt.Fprintln(tw, strings.Repeat("-", 38+13+11*len(variants)))
		rows := byAlg[alg]
		slices.SortFunc(rows, func(a, b Result) int {
			// Sort by descending avx2 speedup if available, else alphabetically.
			sa := speedupFor(a, "avx2")
			sb := speedupFor(b, "avx2")
			if sa != sb {
				return cmp.Compare(sb, sa) // descending
			}
			return strings.Compare(a.Operation, b.Operation)
		})
		for _, r := range rows {
			var line strings.Builder
			fmt.Fprintf(&line, "%-38s\t%12s", r.Operation, formatCycles(r.BaselineMedian))
			for _, v := range variants {
				sp := speedupFor(r, v)
				if math.IsNaN(sp) {
					fmt.Fprintf(&line, "\t%10s", "---")
				} else {
					fmt.Fprintf(&line, "\t%9.2fx", sp)
				}
			}
			fmt.Fprintln(tw, line.String())
		}
	}
	tw.Flush()
}
// speedupFor returns the speedup recorded for variant in r, or NaN when the
// variant has no comparison entry.
func speedupFor(r Result, variant string) float64 {
	// Linear scan is fine: Comparisons holds at most one entry per variant.
	for i := range r.Comparisons {
		if r.Comparisons[i].Variant == variant {
			return r.Comparisons[i].Speedup
		}
	}
	return math.NaN()
}
// formatCycles renders a cycle count compactly: "2.50M", "1.5K", or "999".
func formatCycles(c float64) string {
	switch {
	case c >= 1_000_000:
		return fmt.Sprintf("%.2fM", c/1_000_000)
	case c >= 1_000:
		return fmt.Sprintf("%.1fK", c/1_000)
	default:
		return fmt.Sprintf("%.0f", c)
	}
}
// safeDiv returns a/b, or 0 when b is zero (e.g. a zero CI bound).
func safeDiv(a, b float64) float64 {
	if b != 0 {
		return a / b
	}
	return 0
}
// unique returns the sorted set of fn(record) values across records.
func unique(records []Record, fn func(Record) string) []string {
	seen := make(map[string]struct{}, len(records))
	vals := make([]string, 0, len(records))
	for _, rec := range records {
		v := fn(rec)
		if _, dup := seen[v]; dup {
			continue
		}
		seen[v] = struct{}{}
		vals = append(vals, v)
	}
	slices.Sort(vals)
	return vals
}

487
analysis/figures.py Normal file
View File

@ -0,0 +1,487 @@
#!/usr/bin/env python3
"""Matplotlib draft figures for the PQC SIMD speedup analysis.
Usage:
python3 analysis/figures.py [--json analysis/results.json] [--out figures/]
"""
import argparse
import json
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# Cumulative stages used in Figure 1 (each shows total speedup from refo0).
# Keys index into the "comparisons" dict emitted by analyze.py.
STAGE_KEYS = ["refo0_to_refnv", "refo0_to_ref", "refo0_to_avx2"]
STAGE_LABELS = ["O3, no auto-vec", "O3 + auto-vec", "O3 + hand SIMD (avx2)"]
STAGE_COLORS = ["#4C72B0", "#55A868", "#C44E52"]
# Ops to show in the primary figures (excludes top-level KEM wrappers)
PRIMARY_OPS = {
    "poly_frommsg", "INVNTT", "polyvec_basemul_acc_montgomery", "NTT",
    "indcpa_dec", "polyvec_decompress", "poly_decompress",
    "poly_compress", "poly_tomsg", "polyvec_compress",
    "indcpa_enc", "indcpa_keypair", "gen_a",
    "poly_getnoise_eta1", "poly_getnoise_eta2",
}
# Short display names (axis labels only; keys must match operation names)
OP_SHORT = {
    "poly_frommsg": "frommsg",
    "INVNTT": "INVNTT",
    "polyvec_basemul_acc_montgomery": "basemul",
    "NTT": "NTT",
    "indcpa_dec": "dec",
    "polyvec_decompress": "pvec_decomp",
    "poly_decompress": "poly_decomp",
    "poly_compress": "poly_comp",
    "poly_tomsg": "tomsg",
    "polyvec_compress": "pvec_comp",
    "indcpa_enc": "enc",
    "indcpa_keypair": "keypair",
    "gen_a": "gen_a",
    "poly_getnoise_eta1": "noise_η₁",
    "poly_getnoise_eta2": "noise_η₂",
}
ALGORITHMS = ["mlkem512", "mlkem768", "mlkem1024"]
ALG_TITLES = {"mlkem512": "ML-KEM-512", "mlkem768": "ML-KEM-768", "mlkem1024": "ML-KEM-1024"}
# Operations selected to illustrate the distribution figure:
# one high-speedup arithmetic op, one medium SHAKE-bound op, one low-speedup op
DIST_OPS = [
    ("INVNTT", "INVNTT\n(~55× speedup)"),
    ("gen_a", "gen_a\n(~4× speedup)"),
    ("poly_getnoise_eta1", "noise η₁\n(~1.3× speedup)"),
]
# Per-polynomial ops whose speedup should be param-independent
CROSS_PARAM_OPS = [
    "poly_frommsg",
    "INVNTT",
    "polyvec_basemul_acc_montgomery",
    "NTT",
]
# KEM-level ops for supplementary
KEM_OPS = ["kyber_keypair", "kyber_encaps", "kyber_decaps"]
KEM_SHORT = {"kyber_keypair": "KeyGen", "kyber_encaps": "Encaps", "kyber_decaps": "Decaps"}
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def load(json_path: str) -> list[dict]:
    """Parse the analysis-results JSON file produced by analyze.py."""
    with open(json_path) as fh:
        return json.load(fh)
def ops_for_alg(results: list[dict], alg: str) -> list[dict]:
    """Rows for *alg* restricted to PRIMARY_OPS, descending by ref→avx2 speedup."""
    def _avx2_speedup(row: dict) -> float:
        return row["comparisons"].get("ref_to_avx2", {}).get("speedup", 0)

    selected = [
        r for r in results
        if r["algorithm"] == alg and r["operation"] in PRIMARY_OPS
    ]
    return sorted(selected, key=_avx2_speedup, reverse=True)
# ---------------------------------------------------------------------------
# Figure 1: cumulative grouped bars — speedup at each optimisation stage
#
# Each group shows three bars for one operation:
#   refo0→refnv   total speedup with O3, auto-vec OFF
#   refo0→ref     total speedup with O3, auto-vec ON
#   refo0→avx2    total speedup with O3 + hand-written SIMD
#
# Because all bars share the same baseline (refo0=1), they are directly
# comparable without any additive/multiplicative ambiguity.
# ---------------------------------------------------------------------------
def fig_decomposition(results: list[dict], out_dir: Path) -> None:
    """One log-scale grouped-bar panel per algorithm; saved as 'decomposition'."""
    fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharey=False)
    for ax, alg in zip(axes, ALGORITHMS):
        rows = ops_for_alg(results, alg)
        if not rows:
            # No data for this algorithm: hide the panel entirely.
            ax.set_visible(False)
            continue
        ops = [OP_SHORT.get(r["operation"], r["operation"]) for r in rows]
        n = len(rows)
        group = np.arange(n)
        # Three bars per group, evenly spaced within each group slot
        bar_w = 0.22
        offsets = np.array([-bar_w, 0, bar_w])
        for (key, label, color), offset in zip(
            zip(STAGE_KEYS, STAGE_LABELS, STAGE_COLORS), offsets
        ):
            vals = np.array([r["comparisons"].get(key, {}).get("speedup", 0.0) for r in rows])
            ci_lo = np.array([r["comparisons"].get(key, {}).get("ci95", [0.0, 0.0])[0] for r in rows])
            ci_hi = np.array([r["comparisons"].get(key, {}).get("ci95", [0.0, 0.0])[1] for r in rows])
            # Asymmetric error bars: distance from the point estimate to each CI bound.
            yerr = np.array([vals - ci_lo, ci_hi - vals])
            # Missing comparisons default to 0 above; mask them out of the plot.
            mask = vals > 0
            ax.bar(group[mask] + offset, vals[mask], bar_w,
                   label=label, color=color, alpha=0.88, zorder=3)
            ax.errorbar(group[mask] + offset, vals[mask], yerr=yerr[:, mask],
                        fmt="none", ecolor="black", elinewidth=0.7, capsize=2, zorder=4)
        ax.set_yscale("log")
        ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda v, _: f"{v:g}×"))
        ax.set_title(ALG_TITLES[alg], fontsize=12, fontweight="bold")
        ax.set_xticks(group)
        ax.set_xticklabels(ops, rotation=45, ha="right", fontsize=8)
        # Only the leftmost panel carries the shared y-axis label.
        ax.set_ylabel("Speedup over -O0 (×, log scale)" if alg == "mlkem512" else "")
        ax.grid(axis="y", which="both", linestyle="--", linewidth=0.4, alpha=0.5, zorder=0)
        ax.set_axisbelow(True)
        ax.set_xlim(-0.5, n - 0.5)
    # Single shared legend taken from the first panel's handles.
    handles, labels = axes[0].get_legend_handles_labels()
    fig.legend(handles, labels, loc="upper center", ncol=3,
               fontsize=10, frameon=True, bbox_to_anchor=(0.5, 1.02))
    fig.suptitle(
        "ML-KEM Cumulative Speedup at Each Optimisation Stage "
        "(Intel Xeon Platinum 8268, 95% bootstrap CI)",
        fontsize=11, y=1.06,
    )
    fig.tight_layout()
    _save(fig, out_dir, "decomposition")
# ---------------------------------------------------------------------------
# Figure 2: hand-SIMD speedup (ref→avx2), all algorithms overlaid, log scale
# ---------------------------------------------------------------------------
def fig_hand_simd(results: list[dict], out_dir: Path) -> None:
    """Grouped bars of ref→avx2 speedup per op, one bar set per algorithm."""
    # Pivot: operation → {algorithm → ref_to_avx2 comparison dict}.
    all_ops: dict[str, dict] = {}
    for r in results:
        if r["operation"] in PRIMARY_OPS and "ref_to_avx2" in r["comparisons"]:
            all_ops.setdefault(r["operation"], {})
            all_ops[r["operation"]][r["algorithm"]] = r["comparisons"]["ref_to_avx2"]
    # Order operations by descending mlkem512 speedup.
    ops_sorted = sorted(
        all_ops,
        key=lambda op: -all_ops[op].get("mlkem512", {}).get("speedup", 0),
    )
    short_ops = [OP_SHORT.get(op, op) for op in ops_sorted]
    x = np.arange(len(ops_sorted))
    bar_w = 0.25
    offsets = [-bar_w, 0, bar_w]
    colors = ["#4C72B0", "#55A868", "#C44E52"]
    fig, ax = plt.subplots(figsize=(14, 5))
    for alg, offset, color in zip(ALGORITHMS, offsets, colors):
        vals = np.array([all_ops[op].get(alg, {}).get("speedup", 0) for op in ops_sorted])
        ci_lo = np.array([all_ops[op].get(alg, {}).get("ci95", [0, 0])[0] for op in ops_sorted])
        ci_hi = np.array([all_ops[op].get(alg, {}).get("ci95", [0, 0])[1] for op in ops_sorted])
        yerr = np.array([vals - ci_lo, ci_hi - vals])
        # Missing (algorithm, op) combinations defaulted to 0; skip them.
        mask = vals > 0
        ax.bar(x[mask] + offset, vals[mask], bar_w,
               label=ALG_TITLES[alg], color=color, alpha=0.85, zorder=3)
        ax.errorbar(x[mask] + offset, vals[mask], yerr=yerr[:, mask],
                    fmt="none", ecolor="black", elinewidth=0.7, capsize=2, zorder=4)
    ax.set_yscale("log")
    ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda v, _: f"{v:g}×"))
    ax.set_xticks(x)
    ax.set_xticklabels(short_ops, rotation=45, ha="right", fontsize=9)
    ax.set_ylabel("Speedup ref → avx2 (×, log scale)")
    ax.set_title(
        "Hand-Written SIMD Speedup over Compiler-Optimised C\n"
        "(Intel Xeon Platinum 8268, 95% bootstrap CI, n≥2000 per group)"
    )
    ax.grid(axis="y", which="both", linestyle="--", linewidth=0.4, alpha=0.5, zorder=0)
    ax.set_axisbelow(True)
    ax.legend(fontsize=10)
    fig.tight_layout()
    _save(fig, out_dir, "hand_simd_speedup")
# ---------------------------------------------------------------------------
# Figure 3: Cliff's delta heatmap (ref→avx2)
# ---------------------------------------------------------------------------
def fig_cliffs_heatmap(results: list[dict], out_dir: Path) -> None:
    """Algorithms × operations heatmap of Cliff's δ with annotated cells."""
    # Operations sorted by the maximum delta seen across algorithms (descending).
    ops_set = sorted(
        {r["operation"] for r in results if "ref_to_avx2" in r["comparisons"]},
        key=lambda op: -max(
            r["comparisons"]["ref_to_avx2"]["cliffs_delta"]
            for r in results
            if r["operation"] == op and "ref_to_avx2" in r["comparisons"]
        ),
    )
    short_ops = [OP_SHORT.get(op, op) for op in ops_set]
    # NaN cells render blank for missing (algorithm, op) combinations.
    data = np.full((len(ALGORITHMS), len(ops_set)), np.nan)
    for i, alg in enumerate(ALGORITHMS):
        for j, op in enumerate(ops_set):
            match = [r for r in results if r["algorithm"] == alg and r["operation"] == op]
            if match and "ref_to_avx2" in match[0]["comparisons"]:
                data[i, j] = match[0]["comparisons"]["ref_to_avx2"]["cliffs_delta"]
    n_ops = len(ops_set)
    # Width scales with the number of operations so labels stay readable.
    fig, ax = plt.subplots(figsize=(max(10, n_ops * 0.85), 3.2))
    im = ax.imshow(data, aspect="auto", cmap="RdYlGn", vmin=-1, vmax=1)
    plt.colorbar(im, ax=ax, label="Cliff's δ", fraction=0.03, pad=0.02)
    ax.set_yticks(range(len(ALGORITHMS)))
    ax.set_yticklabels([ALG_TITLES[a] for a in ALGORITHMS], fontsize=10)
    ax.set_xticks(range(n_ops))
    ax.set_xticklabels(short_ops, rotation=45, ha="right", fontsize=9)
    ax.set_title(
        "Cliff's δ (ref vs. avx2) δ = +1.00: avx2 strictly faster in every observation pair",
        fontsize=10,
    )
    for i in range(len(ALGORITHMS)):
        for j in range(n_ops):
            if not np.isnan(data[i, j]):
                # White text on dark green cells, black elsewhere
                text_color = "white" if data[i, j] > 0.85 else "black"
                ax.text(j, i, f"{data[i, j]:+.3f}", ha="center", va="center",
                        fontsize=9, color=text_color, fontweight="bold")
    fig.tight_layout()
    _save(fig, out_dir, "cliffs_delta_heatmap")
# ---------------------------------------------------------------------------
# Figure 4: cycle distribution overlays (requires raw aggregator JSON)
#
# Three panels: one high-speedup op, one medium, one low.
# Each panel overlays ref and avx2 histograms + KDE for mlkem512.
# Log x-axis exposes the scale difference honestly.
# ---------------------------------------------------------------------------
def fig_distributions(raw_records: list[dict], out_dir: Path, alg: str = "mlkem512") -> None:
    """Histogram + log-space KDE overlays of ref vs. avx2 cycle counts.

    *raw_records* must come from the aggregator run with --raw so each
    record carries per-observation samples.
    """
    from scipy.stats import gaussian_kde
    # Build lookup: (alg, variant, op) → raw array
    raw: dict[tuple, np.ndarray] = {}
    for r in raw_records:
        if r.get("raw"):
            raw[(r["algorithm"], r["variant"], r["operation"])] = np.array(r["raw"], dtype=np.float64)
    n_ops = len(DIST_OPS)
    fig, axes = plt.subplots(1, n_ops, figsize=(5 * n_ops, 4))
    variant_style = {
        "ref": {"color": "#4C72B0", "label": "ref (O3)", "alpha": 0.55, "zorder": 2},
        "avx2": {"color": "#C44E52", "label": "avx2", "alpha": 0.65, "zorder": 3},
    }
    for ax, (op, subtitle) in zip(axes, DIST_OPS):
        plotted_any = False
        for variant in ("ref", "avx2"):
            arr = raw.get((alg, variant, op))
            if arr is None:
                continue
            plotted_any = True
            s = variant_style[variant]
            # Histogram on log scale
            log_arr = np.log10(arr)
            lo, hi = np.floor(log_arr.min()), np.ceil(log_arr.max())
            bins = np.logspace(lo, hi, 60)
            ax.hist(arr, bins=bins, density=True, color=s["color"],
                    alpha=s["alpha"], zorder=s["zorder"], label=s["label"])
            # KDE on log scale, back-transformed
            kde = gaussian_kde(log_arr, bw_method=0.25)
            xs_log = np.linspace(lo, hi, 400)
            xs = 10 ** xs_log
            # KDE is in log space; convert density: p(x) = p(log x) / (x ln10)
            ys = kde(xs_log) / (xs * np.log(10))
            ax.plot(xs, ys, color=s["color"], linewidth=1.8, zorder=s["zorder"] + 1)
            # Median line
            med = float(np.median(arr))
            ax.axvline(med, color=s["color"], linewidth=1.2, linestyle="--", zorder=5)
        if not plotted_any:
            # Neither variant had raw data for this op: hide the panel.
            ax.set_visible(False)
            continue
        ax.set_xscale("log")
        ax.set_xlabel("Cycles (log scale)")
        # Only the first panel carries the y-axis label.
        ax.set_ylabel("Density" if op == DIST_OPS[0][0] else "")
        ax.set_title(subtitle, fontsize=10)
        ax.legend(fontsize=9)
        ax.xaxis.set_major_formatter(ticker.LogFormatterSciNotation(labelOnlyBase=False))
        ax.grid(axis="x", which="both", linestyle="--", linewidth=0.4, alpha=0.4)
        ax.set_axisbelow(True)
    fig.suptitle(
        f"Cycle Count Distributions — ref vs. avx2 ({ALG_TITLES[alg]})\n"
        "Dashed lines show medians. Distributions are right-skewed → nonparametric statistics.",
        fontsize=10,
    )
    fig.tight_layout()
    _save(fig, out_dir, "distributions")
# ---------------------------------------------------------------------------
# Figure 5: cross-param speedup consistency
#
# For per-polynomial operations the polynomial dimension is always 256,
# independent of the security parameter k. Speedups should be identical
# across mlkem512/768/1024. This figure verifies that.
# ---------------------------------------------------------------------------
def fig_cross_param(results: list[dict], out_dir: Path) -> None:
    """Plot ref→avx2 speedup for per-polynomial ops across security parameters.

    Per-polynomial operations always act on dimension-256 polynomials, so the
    speedup should be invariant across mlkem512/768/1024; this figure makes
    that visible. Error bars are the 95% bootstrap CI from the comparison rows.
    """
    ops = CROSS_PARAM_OPS
    tick_labels = [OP_SHORT.get(op, op) for op in ops]
    positions = np.arange(len(ops))
    width = 0.22
    shifts = np.array([-width, 0.0, width])
    palette = ["#4C72B0", "#55A868", "#C44E52"]

    fig, ax = plt.subplots(figsize=(8, 4))
    for alg, shift, shade in zip(ALGORITHMS, shifts, palette):
        speedups, lo_bounds, hi_bounds = [], [], []
        for op in ops:
            # First matching record wins, mirroring the original lookup.
            rec = next(
                (r for r in results
                 if r["algorithm"] == alg and r["operation"] == op),
                None,
            )
            if rec is not None and "ref_to_avx2" in rec["comparisons"]:
                comp = rec["comparisons"]["ref_to_avx2"]
                speedups.append(comp["speedup"])
                lo_bounds.append(comp["ci95"][0])
                hi_bounds.append(comp["ci95"][1])
            else:
                # Missing data is encoded as zero and masked out below.
                speedups.append(0)
                lo_bounds.append(0)
                hi_bounds.append(0)
        speedups = np.array(speedups)
        lo_bounds = np.array(lo_bounds)
        hi_bounds = np.array(hi_bounds)
        err = np.array([speedups - lo_bounds, hi_bounds - speedups])
        present = speedups > 0
        ax.bar(positions[present] + shift, speedups[present], width,
               label=ALG_TITLES[alg], color=shade, alpha=0.88, zorder=3)
        ax.errorbar(positions[present] + shift, speedups[present],
                    yerr=err[:, present], fmt="none", ecolor="black",
                    elinewidth=0.8, capsize=3, zorder=4)
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels, fontsize=11)
    ax.set_ylabel("Speedup ref → avx2 (×)")
    ax.set_title(
        "Per-Polynomial Operation Speedup Across Security Parameters\n"
        "(polynomial dim = 256 for all; NTT variation attributed to cache-state differences)"
    )
    ax.yaxis.set_minor_locator(ticker.AutoMinorLocator())
    ax.grid(axis="y", linestyle="--", linewidth=0.4, alpha=0.5, zorder=0)
    ax.set_axisbelow(True)
    ax.legend(fontsize=10)
    fig.tight_layout()
    _save(fig, out_dir, "cross_param")
# ---------------------------------------------------------------------------
# Figure S1: KEM-level end-to-end speedup (supplementary)
# ---------------------------------------------------------------------------
def fig_kem_level(results: list[dict], out_dir: Path) -> None:
    """Supplementary figure: end-to-end KEM speedup (ref → avx2) per parameter set.

    One bar group per security level; missing (algorithm, operation) pairs are
    simply omitted from the plot. Error bars show the 95% bootstrap CI.
    """
    ops = KEM_OPS
    tick_labels = [KEM_SHORT[op] for op in ops]
    xs = np.arange(len(ops))
    width = 0.22
    shifts = (-width, 0.0, width)
    palette = ("#4C72B0", "#55A868", "#C44E52")

    fig, ax = plt.subplots(figsize=(7, 4))
    for alg, shift, color in zip(ALGORITHMS, shifts, palette):
        speedup = np.zeros(len(ops))
        lo = np.zeros(len(ops))
        hi = np.zeros(len(ops))
        for i, op in enumerate(ops):
            # First matching record wins, mirroring the original lookup.
            rec = next(
                (r for r in results
                 if r["algorithm"] == alg and r["operation"] == op),
                None,
            )
            if rec is not None and "ref_to_avx2" in rec["comparisons"]:
                comp = rec["comparisons"]["ref_to_avx2"]
                speedup[i] = comp["speedup"]
                lo[i], hi[i] = comp["ci95"]
        err = np.array([speedup - lo, hi - speedup])
        keep = speedup > 0
        ax.bar(xs[keep] + shift, speedup[keep], width,
               label=ALG_TITLES[alg], color=color, alpha=0.88, zorder=3)
        ax.errorbar(xs[keep] + shift, speedup[keep], yerr=err[:, keep],
                    fmt="none", ecolor="black", elinewidth=0.8, capsize=3, zorder=4)
    ax.set_xticks(xs)
    ax.set_xticklabels(tick_labels, fontsize=12)
    ax.set_ylabel("Speedup ref → avx2 (×)")
    ax.set_title(
        "End-to-End KEM Speedup (ref → avx2)\n"
        "(Intel Xeon Platinum 8268, 95% bootstrap CI)"
    )
    ax.yaxis.set_minor_locator(ticker.AutoMinorLocator())
    ax.grid(axis="y", linestyle="--", linewidth=0.4, alpha=0.5, zorder=0)
    ax.set_axisbelow(True)
    ax.legend(fontsize=10)
    fig.tight_layout()
    _save(fig, out_dir, "kem_level")
# ---------------------------------------------------------------------------
# Shared save helper
# ---------------------------------------------------------------------------
def _save(fig: plt.Figure, out_dir: Path, stem: str) -> None:
    """Write *fig* as both a PDF and a 150-dpi PNG under *out_dir*, then close it."""
    for ext, extra in (("pdf", {}), ("png", {"dpi": 150})):
        fig.savefig(out_dir / f"{stem}.{ext}", bbox_inches="tight", **extra)
    print(f"Saved {out_dir}/{stem}.{{pdf,png}}")
    plt.close(fig)
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main() -> None:
    """CLI entry point: load analyzed results and emit every figure."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--json", default="analysis/results.json",
                        help="analyzed results JSON (from analyze.py)")
    parser.add_argument("--raw-json", default=None,
                        help="raw aggregator JSON (from aggregate --raw); required for --distributions")
    parser.add_argument("--out", default="analysis/figures")
    args = parser.parse_args()

    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): `load` is assumed to be defined earlier in this module —
    # confirm it exists (the aggregator-side script exposes `load_json`).
    results = load(args.json)
    print(f"Loaded {len(results)} result rows.")

    # Main figures always render; the distributions figure needs raw samples.
    for render in (fig_decomposition, fig_hand_simd, fig_cliffs_heatmap,
                   fig_cross_param, fig_kem_level):
        render(results, out_dir)

    if args.raw_json:
        raw_records = load(args.raw_json)
        print(f"Loaded {len(raw_records)} raw groups for distributions.")
        fig_distributions(raw_records, out_dir)
    else:
        print("Skipping distributions figure (pass --raw-json to enable).")


if __name__ == "__main__":
    main()

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 94 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 58 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 122 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 116 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 102 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 52 KiB

3
analysis/go.mod Normal file
View File

@ -0,0 +1,3 @@
module git.levineuwirth.org/neuwirth/where-simd-helps/analysis
go 1.26.1

189
analysis/pkg/parse/parse.go Normal file
View File

@ -0,0 +1,189 @@
// Package parse reads pqc-bench .out files produced by the SLURM harness.
//
// Each file contains a SLURM prolog header followed by 1–N "loop spin" blocks.
// Each spin block reports one median+average pair per benchmarked operation.
package parse
import (
"bufio"
"fmt"
"os"
"strconv"
"strings"
)
// Meta holds the SLURM prolog metadata extracted from the file header.
type Meta struct {
	JobID     string // "## Job ID" prolog field
	JobName   string // "## Job Name" prolog field
	Node      string // "## Nodelist" prolog field
	StartedAt string // "## Job Started" prolog field (kept as the raw string)
	Directory string // "## Directory" the job ran from
	// Explicit fields emitted by submit.sh for reliable downstream parsing.
	BenchVariant string // "## BENCH_VARIANT" header value (e.g. "ref", "avx2")
	BenchParam   string // "## BENCH_PARAM" header value (security parameter)
	BenchNSpins  string // "## BENCH_NSPINS" header value (loop spin count)
}
// Measurement is a single operation's reported statistics for one loop spin.
type Measurement struct {
	Median  int64 // value of the "median:" line, in cycles/ticks
	Average int64 // value of the "average:" line, in cycles/ticks
}
// Run holds everything parsed from one .out file.
type Run struct {
	File string // path of the parsed .out file
	Meta Meta   // SLURM prolog metadata from the file header
	// Spins[i] maps operation name → measurement for loop spin i+1.
	Spins []map[string]Measurement
}
// ParseFile reads a single .out file and returns a Run.
//
// The parser is a line-oriented state machine: "##" prolog lines feed Meta,
// a "Loop spin:" line opens a fresh spin map, and inside a spin each
// operation appears as a name line ("<op>:") followed by a "median:" and an
// "average:" line. A measurement is committed to the spin map only when its
// "average:" line arrives.
func ParseFile(path string) (*Run, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()
	run := &Run{File: path}
	scanner := bufio.NewScanner(f)
	// Default buffer size is 64KB; lines are short so this is fine.
	var currentSpin map[string]Measurement
	var currentOp string
	var pendingMedian int64 // median seen for currentOp, awaiting its average
	inSpin := false
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		// SLURM prolog lines start with ##
		if strings.HasPrefix(line, "##") {
			parsePrologLine(line, &run.Meta)
			continue
		}
		// New loop spin
		if strings.HasPrefix(line, "Loop spin:") {
			if inSpin && currentSpin != nil {
				// Close out the previous spin before starting the next.
				run.Spins = append(run.Spins, currentSpin)
			}
			currentSpin = make(map[string]Measurement)
			currentOp = ""
			inSpin = true
			continue
		}
		if !inSpin {
			// Ignore everything before the first "Loop spin:" marker.
			continue
		}
		// Operation name line ends with ':'
		// (the median/average prefixes are excluded so their lines are
		// not mistaken for operation names).
		if strings.HasSuffix(line, ":") && !strings.HasPrefix(line, "median") && !strings.HasPrefix(line, "average") {
			currentOp = strings.TrimSuffix(line, ":")
			currentOp = strings.TrimSpace(currentOp)
			continue
		}
		if currentOp == "" {
			// median/average lines with no pending operation are skipped.
			continue
		}
		if strings.HasPrefix(line, "median:") {
			v, err := parseCycles(line)
			if err != nil {
				return nil, fmt.Errorf("%s: %w", path, err)
			}
			pendingMedian = v
			continue
		}
		if strings.HasPrefix(line, "average:") {
			avg, err := parseCycles(line)
			if err != nil {
				return nil, fmt.Errorf("%s: %w", path, err)
			}
			// The average line completes the measurement; commit and reset.
			currentSpin[currentOp] = Measurement{Median: pendingMedian, Average: avg}
			currentOp = ""
			pendingMedian = 0
			continue
		}
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("%s: %w", path, err)
	}
	// Flush last spin
	if inSpin && currentSpin != nil {
		run.Spins = append(run.Spins, currentSpin)
	}
	return run, nil
}
// parseCycles extracts the integer from lines like "median: 25194 cycles/ticks".
func parseCycles(line string) (int64, error) {
	// Expected shape: "<label>: <N> cycles/ticks" — the value is field #2.
	fields := strings.Fields(line)
	if len(fields) >= 2 {
		return strconv.ParseInt(fields[1], 10, 64)
	}
	return 0, fmt.Errorf("unexpected line format: %q", line)
}
// parsePrologLine extracts key/value pairs from SLURM header lines.
func parsePrologLine(line string, meta *Meta) {
	// Lines look like: "## Job ID : 11233228". Pure decoration lines
	// ("####...") contain no ':' and fall out of the Cut below.
	body := strings.TrimSpace(strings.TrimLeft(line, "#"))
	key, val, ok := strings.Cut(body, ":")
	if !ok {
		return
	}
	// Map each recognised header key to its destination field; unknown
	// keys are silently ignored, matching the original switch behaviour.
	dest := map[string]*string{
		"Job ID":        &meta.JobID,
		"Job Name":      &meta.JobName,
		"Nodelist":      &meta.Node,
		"Job Started":   &meta.StartedAt,
		"Directory":     &meta.Directory,
		"BENCH_VARIANT": &meta.BenchVariant,
		"BENCH_PARAM":   &meta.BenchParam,
		"BENCH_NSPINS":  &meta.BenchNSpins,
	}
	if field, known := dest[strings.TrimSpace(key)]; known {
		*field = strings.TrimSpace(val)
	}
}
// InferVariant returns the benchmark variant for a run.
//
// Priority:
//  1. Explicit BENCH_VARIANT metadata emitted by submit.sh (most reliable).
//  2. The path segment immediately following "kyber/" in the SLURM Directory
//     field (works for old-style runs that ran from inside the kyber tree).
//  3. "unknown" if neither is available.
func InferVariant(meta Meta) string {
	if v := meta.BenchVariant; v != "" {
		return v
	}
	const marker = "kyber/"
	pos := strings.LastIndex(meta.Directory, marker)
	if pos < 0 {
		return "unknown"
	}
	tail := meta.Directory[pos+len(marker):]
	name, _, _ := strings.Cut(tail, "/")
	return name
}

133
analysis/pkg/stats/stats.go Normal file
View File

@ -0,0 +1,133 @@
// Package stats computes summary statistics over slices of cycle counts.
package stats
import (
"cmp"
"math"
"math/rand/v2"
"slices"
)
// bootstrapN is the number of bootstrap resamples used for the median CI.
const bootstrapN = 10_000

// Summary holds all computed statistics for one (algorithm, variant, operation) group.
type Summary struct {
	N    int     // sample size
	Mean float64 // arithmetic mean
	// Median is the sample median (p50).
	Median float64
	Std    float64 // population standard deviation (divides by N, not N-1)
	MAD    float64 // median absolute deviation from the median
	P5     float64 // 5th percentile
	P25    float64 // 25th percentile
	P75    float64 // 75th percentile
	P95    float64 // 95th percentile
	P99    float64 // 99th percentile
	// CI95 is the bootstrapped 95% confidence interval for the median.
	CI95 [2]float64
}
// Compute derives all statistics from a sorted (ascending) slice of values.
// The caller must sort the slice before passing it in.
func Compute(sorted []int64) Summary {
	if len(sorted) == 0 {
		return Summary{}
	}
	// Mean and median feed the spread statistics, so compute them first.
	m := mean(sorted)
	med := percentileFromSorted(sorted, 50)
	return Summary{
		N:      len(sorted),
		Mean:   m,
		Median: med,
		Std:    stddev(sorted, m),
		MAD:    mad(sorted, med),
		P5:     percentileFromSorted(sorted, 5),
		P25:    percentileFromSorted(sorted, 25),
		P75:    percentileFromSorted(sorted, 75),
		P95:    percentileFromSorted(sorted, 95),
		P99:    percentileFromSorted(sorted, 99),
		CI95:   bootstrapMedianCI(sorted, bootstrapN),
	}
}
// mean returns the arithmetic mean of xs (caller guarantees xs is non-empty).
func mean(xs []int64) float64 {
	total := 0.0
	for _, v := range xs {
		total += float64(v)
	}
	return total / float64(len(xs))
}
// stddev returns the population standard deviation of xs about mean m.
func stddev(xs []int64, m float64) float64 {
	var sumSq float64
	for _, v := range xs {
		diff := float64(v) - m
		sumSq += diff * diff
	}
	return math.Sqrt(sumSq / float64(len(xs)))
}
// mad returns the median absolute deviation of sorted about the given median.
func mad(sorted []int64, median float64) float64 {
	devs := make([]float64, 0, len(sorted))
	for _, v := range sorted {
		devs = append(devs, math.Abs(float64(v)-median))
	}
	slices.Sort(devs)
	mid := len(devs) / 2
	if len(devs)%2 == 0 {
		// Even count: average the two central deviations.
		return (devs[mid-1] + devs[mid]) / 2
	}
	return devs[mid]
}
// percentileFromSorted uses linear interpolation (same as numpy's default).
func percentileFromSorted(sorted []int64, p float64) float64 {
	if len(sorted) == 1 {
		return float64(sorted[0])
	}
	// Fractional rank into the sorted slice; interpolate its neighbours.
	pos := p / 100 * float64(len(sorted)-1)
	lower := int(math.Floor(pos))
	upper := int(math.Ceil(pos))
	w := pos - float64(lower)
	return (1-w)*float64(sorted[lower]) + w*float64(sorted[upper])
}
// bootstrapMedianCI resamples the data bootstrapN times and returns the
// [2.5th, 97.5th] percentile of the bootstrap median distribution.
func bootstrapMedianCI(sorted []int64, iters int) [2]float64 {
n := len(sorted)
buf := make([]int64, n)
medians := make([]float64, iters)
for i := range iters {
for j := range n {
buf[j] = sorted[rand.IntN(n)]
}
slices.Sort(buf)
medians[i] = percentileFromSorted(buf, 50)
}
slices.Sort(medians)
return [2]float64{
percentile64(medians, 2.5),
percentile64(medians, 97.5),
}
}
// percentile64 is the float64 analogue of percentileFromSorted.
func percentile64(sorted []float64, p float64) float64 {
	if len(sorted) == 1 {
		return sorted[0]
	}
	pos := p / 100 * float64(len(sorted)-1)
	i := int(math.Floor(pos))
	j := int(math.Ceil(pos))
	w := pos - float64(i)
	return (1-w)*sorted[i] + w*sorted[j]
}
// SortInt64 sorts a slice of int64 in place (ascending).
func SortInt64(xs []int64) {
	slices.SortFunc(xs, func(a, b int64) int { return cmp.Compare(a, b) })
}

4382
analysis/results.json Normal file

File diff suppressed because it is too large Load Diff

726962
analysis/results_raw.json Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,19 @@
## SLURM PROLOG ###############################################################
## Job ID : 1179894
## Job Name : bench_mlkem1024_avx2
## Nodelist : node2334
## CPUs : 1
## Mem/Node : 256 MB
## Directory : /oscar/data/lshu/lneuwirt/where-simd-helps/slurm
## Job Started : Thu Apr 2 12:18:20 PM EDT 2026
###############################################################################
pid 1627591's current affinity list: 41
## BENCH_VARIANT : avx2
## BENCH_PARAM : 1024
## BENCH_NSPINS : 1000
## BENCH_BINARY : /users/lneuwirt/data/lneuwirt/where-simd-helps/harness/build-hpc/bench_mlkem1024_avx2
## BENCH_DATE : 2026-04-02T12:18:20-04:00
## CPU_MODEL : Intel(R) Xeon(R) Platinum 8268 CPU @ 2.90GHz
## PERF_PARANOID : 2
## PAPI_BUILD : OFF
ERROR: binary not found or not executable:

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,19 @@
## SLURM PROLOG ###############################################################
## Job ID : 1179893
## Job Name : bench_mlkem1024_ref
## Nodelist : node2334
## CPUs : 1
## Mem/Node : 256 MB
## Directory : /oscar/data/lshu/lneuwirt/where-simd-helps/slurm
## Job Started : Thu Apr 2 12:18:20 PM EDT 2026
###############################################################################
pid 1627590's current affinity list: 40
## BENCH_VARIANT : ref
## BENCH_PARAM : 1024
## BENCH_NSPINS : 1000
## BENCH_BINARY : /users/lneuwirt/data/lneuwirt/where-simd-helps/harness/build-hpc/bench_mlkem1024_ref
## BENCH_DATE : 2026-04-02T12:18:20-04:00
## CPU_MODEL : Intel(R) Xeon(R) Platinum 8268 CPU @ 2.90GHz
## PERF_PARANOID : 2
## PAPI_BUILD : OFF
ERROR: binary not found or not executable:

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,19 @@
## SLURM PROLOG ###############################################################
## Job ID : 1179890
## Job Name : bench_mlkem512_avx2
## Nodelist : node2333
## CPUs : 1
## Mem/Node : 256 MB
## Directory : /oscar/data/lshu/lneuwirt/where-simd-helps/slurm
## Job Started : Thu Apr 2 12:18:20 PM EDT 2026
###############################################################################
pid 2240632's current affinity list: 40
## BENCH_VARIANT : avx2
## BENCH_PARAM : 512
## BENCH_NSPINS : 1000
## BENCH_BINARY : /users/lneuwirt/data/lneuwirt/where-simd-helps/harness/build-hpc/bench_mlkem512_avx2
## BENCH_DATE : 2026-04-02T12:18:20-04:00
## CPU_MODEL : Intel(R) Xeon(R) Platinum 8268 CPU @ 2.90GHz
## PERF_PARANOID : 2
## PAPI_BUILD : OFF
ERROR: binary not found or not executable:

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,19 @@
## SLURM PROLOG ###############################################################
## Job ID : 1179889
## Job Name : bench_mlkem512_ref
## Nodelist : node2333
## CPUs : 1
## Mem/Node : 256 MB
## Directory : /oscar/data/lshu/lneuwirt/where-simd-helps/slurm
## Job Started : Thu Apr 2 12:18:20 PM EDT 2026
###############################################################################
pid 2240630's current affinity list: 39
## BENCH_VARIANT : ref
## BENCH_PARAM : 512
## BENCH_NSPINS : 1000
## BENCH_BINARY : /users/lneuwirt/data/lneuwirt/where-simd-helps/harness/build-hpc/bench_mlkem512_ref
## BENCH_DATE : 2026-04-02T12:18:20-04:00
## CPU_MODEL : Intel(R) Xeon(R) Platinum 8268 CPU @ 2.90GHz
## PERF_PARANOID : 2
## PAPI_BUILD : OFF
ERROR: binary not found or not executable:

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,19 @@
## SLURM PROLOG ###############################################################
## Job ID : 1179892
## Job Name : bench_mlkem768_avx2
## Nodelist : node2334
## CPUs : 1
## Mem/Node : 256 MB
## Directory : /oscar/data/lshu/lneuwirt/where-simd-helps/slurm
## Job Started : Thu Apr 2 12:18:20 PM EDT 2026
###############################################################################
pid 1627592's current affinity list: 32
## BENCH_VARIANT : avx2
## BENCH_PARAM : 768
## BENCH_NSPINS : 1000
## BENCH_BINARY : /users/lneuwirt/data/lneuwirt/where-simd-helps/harness/build-hpc/bench_mlkem768_avx2
## BENCH_DATE : 2026-04-02T12:18:20-04:00
## CPU_MODEL : Intel(R) Xeon(R) Platinum 8268 CPU @ 2.90GHz
## PERF_PARANOID : 2
## PAPI_BUILD : OFF
ERROR: binary not found or not executable:

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,19 @@
## SLURM PROLOG ###############################################################
## Job ID : 1179891
## Job Name : bench_mlkem768_ref
## Nodelist : node2333
## CPUs : 1
## Mem/Node : 256 MB
## Directory : /oscar/data/lshu/lneuwirt/where-simd-helps/slurm
## Job Started : Thu Apr 2 12:18:20 PM EDT 2026
###############################################################################
pid 2240631's current affinity list: 42
## BENCH_VARIANT : ref
## BENCH_PARAM : 768
## BENCH_NSPINS : 1000
## BENCH_BINARY : /users/lneuwirt/data/lneuwirt/where-simd-helps/harness/build-hpc/bench_mlkem768_ref
## BENCH_DATE : 2026-04-02T12:18:20-04:00
## CPU_MODEL : Intel(R) Xeon(R) Platinum 8268 CPU @ 2.90GHz
## PERF_PARANOID : 2
## PAPI_BUILD : OFF
ERROR: binary not found or not executable:

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,57 +1,157 @@
cmake_minimum_required(VERSION 3.20)
project(pqc-bench C)
project(pqc-bench C ASM)
set(CMAKE_C_STANDARD 11)
# Compiler flags
# Release build with full optimization; override on the command line:
# cmake -DCMAKE_BUILD_TYPE=Debug ..
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
set(CMAKE_C_FLAGS_RELEASE "-O3 -march=native")
set(KYBER_ROOT ${CMAKE_SOURCE_DIR}/../algorithms/kyber)
# Algorithm root (submodule)
# Each target below compiles a variant of test_speed.c against a specific
# algorithm build. Add algorithm libraries as submodule CMake subdirectories
# or via add_library() here as the project grows.
#
# Example layout once kyber submodule is added:
# algorithms/kyber/ref/ static lib kyber512_ref, kyber768_ref, kyber1024_ref
# algorithms/kyber/avx2/ static lib kyber512_avx2, ...
# Helpers shared across variants
# cpucycles / speed_print live in the kyber ref test dir; both variants use
# the same copies (avx2/test/ has identical files).
set(BENCH_HELPERS
${KYBER_ROOT}/ref/test/cpucycles.c
${KYBER_ROOT}/ref/test/speed_print.c
)
# Harness source
set(HARNESS_SRC src/test_speed.c)
set(HARNESS_SRC ${CMAKE_SOURCE_DIR}/src/test_speed.c)
# Build variants
# Uncomment and adjust as algorithm libraries become available.
#
# foreach(PARAM 512 768 1024)
# foreach(VARIANT ref refnv)
# set(TARGET "bench_mlkem${PARAM}_${VARIANT}")
# add_executable(${TARGET} ${HARNESS_SRC})
# target_include_directories(${TARGET} PRIVATE
# ${CMAKE_SOURCE_DIR}/../algorithms/kyber/${VARIANT})
# target_link_libraries(${TARGET} kyber${PARAM}_${VARIANT})
# target_compile_definitions(${TARGET} PRIVATE KYBER_K=${PARAM})
# endforeach()
# endforeach()
# ref sources (pure C, portable)
set(REF_DIR ${KYBER_ROOT}/ref)
set(REF_SOURCES
${REF_DIR}/kem.c
${REF_DIR}/indcpa.c
${REF_DIR}/polyvec.c
${REF_DIR}/poly.c
${REF_DIR}/ntt.c
${REF_DIR}/cbd.c
${REF_DIR}/reduce.c
${REF_DIR}/verify.c
${REF_DIR}/fips202.c
${REF_DIR}/symmetric-shake.c
${REF_DIR}/randombytes.c
)
# avx2 sources (C + x86 assembly)
set(AVX2_DIR ${KYBER_ROOT}/avx2)
set(AVX2_SOURCES
${AVX2_DIR}/kem.c
${AVX2_DIR}/indcpa.c
${AVX2_DIR}/polyvec.c
${AVX2_DIR}/poly.c
${AVX2_DIR}/cbd.c
${AVX2_DIR}/verify.c
${AVX2_DIR}/fips202.c
${AVX2_DIR}/fips202x4.c
${AVX2_DIR}/symmetric-shake.c
${AVX2_DIR}/randombytes.c
${AVX2_DIR}/consts.c
${AVX2_DIR}/rejsample.c
${AVX2_DIR}/fq.S
${AVX2_DIR}/shuffle.S
${AVX2_DIR}/ntt.S
${AVX2_DIR}/invntt.S
${AVX2_DIR}/basemul.S
${AVX2_DIR}/keccak4x/KeccakP-1600-times4-SIMD256.c
)
# KYBER_K mapping
# 512 K=2, 768 K=3, 1024 K=4
set(KYBER_K_512 2)
set(KYBER_K_768 3)
set(KYBER_K_1024 4)
# Build targets
foreach(LEVEL 512 768 1024)
set(K ${KYBER_K_${LEVEL}})
# ref optimised reference (O3, auto-vectorisation enabled)
set(REF_TARGET bench_mlkem${LEVEL}_ref)
add_executable(${REF_TARGET}
${HARNESS_SRC}
${REF_SOURCES}
${BENCH_HELPERS}
)
target_include_directories(${REF_TARGET} PRIVATE
${REF_DIR}
${REF_DIR}/test
)
target_compile_definitions(${REF_TARGET} PRIVATE KYBER_K=${K})
target_compile_options(${REF_TARGET} PRIVATE -O3 -fomit-frame-pointer)
# refnv ref with auto-vectorisation disabled; isolates scalar O3 performance
set(REFNV_TARGET bench_mlkem${LEVEL}_refnv)
add_executable(${REFNV_TARGET}
${HARNESS_SRC}
${REF_SOURCES}
${BENCH_HELPERS}
)
target_include_directories(${REFNV_TARGET} PRIVATE
${REF_DIR}
${REF_DIR}/test
)
target_compile_definitions(${REFNV_TARGET} PRIVATE KYBER_K=${K})
target_compile_options(${REFNV_TARGET} PRIVATE
-O3 -fomit-frame-pointer -fno-tree-vectorize
)
# refo0 ref at -O0; establishes unoptimised baseline
set(REFO0_TARGET bench_mlkem${LEVEL}_refo0)
add_executable(${REFO0_TARGET}
${HARNESS_SRC}
${REF_SOURCES}
${BENCH_HELPERS}
)
target_include_directories(${REFO0_TARGET} PRIVATE
${REF_DIR}
${REF_DIR}/test
)
target_compile_definitions(${REFO0_TARGET} PRIVATE KYBER_K=${K})
target_compile_options(${REFO0_TARGET} PRIVATE -O0)
# avx2 hand-written AVX2 assembly + O3
set(AVX2_TARGET bench_mlkem${LEVEL}_avx2)
add_executable(${AVX2_TARGET}
${HARNESS_SRC}
${AVX2_SOURCES}
${BENCH_HELPERS}
)
target_include_directories(${AVX2_TARGET} PRIVATE
${AVX2_DIR}
${AVX2_DIR}/test
${AVX2_DIR}/keccak4x
)
target_compile_definitions(${AVX2_TARGET} PRIVATE KYBER_K=${K})
target_compile_options(${AVX2_TARGET} PRIVATE
-O3 -fomit-frame-pointer -mavx2 -mbmi2 -mpopcnt -march=native -mtune=native
)
endforeach()
# PAPI (hardware performance counters)
# Optional; enable with -DWITH_PAPI=ON
option(WITH_PAPI "Link against PAPI for hardware counter collection" OFF)
if(WITH_PAPI)
find_library(PAPI_LIB papi REQUIRED)
find_path(PAPI_INCLUDE papi.h REQUIRED)
# Targets that need PAPI:
# target_include_directories(<target> PRIVATE ${PAPI_INCLUDE})
# target_link_libraries(<target> ${PAPI_LIB})
foreach(LEVEL 512 768 1024)
foreach(VARIANT ref refnv refo0 avx2)
set(T bench_mlkem${LEVEL}_${VARIANT})
target_include_directories(${T} PRIVATE ${PAPI_INCLUDE})
target_link_libraries(${T} ${PAPI_LIB})
target_compile_definitions(${T} PRIVATE WITH_PAPI)
endforeach()
endforeach()
endif()
# RAPL energy measurement
# Optional; enable with -DWITH_RAPL=ON (requires root or CAP_SYS_RAWIO)
# Requires root or CAP_SYS_RAWIO on the benchmark node.
option(WITH_RAPL "Enable Intel RAPL energy measurement" OFF)
if(WITH_RAPL)
# target_compile_definitions(<target> PRIVATE WITH_RAPL)
foreach(LEVEL 512 768 1024)
foreach(VARIANT ref refnv refo0 avx2)
target_compile_definitions(bench_mlkem${LEVEL}_${VARIANT} PRIVATE WITH_RAPL)
endforeach()
endforeach()
endif()

View File

@ -0,0 +1,394 @@
# This is the CMakeCache file.
# For build in directory: /home/jeans/Repos/research/pqc/where-simd-helps/harness/build-papi
# It was generated by CMake: /usr/bin/cmake
# You can edit this file to change values found and used by cmake.
# If you do not want to change any of the values, simply exit the editor.
# If you do want to change a value, simply edit, save, and exit the editor.
# The syntax for the file is as follows:
# KEY:TYPE=VALUE
# KEY is the name of a variable in the cache.
# TYPE is a hint to GUIs for the type of VALUE, DO NOT EDIT TYPE!.
# VALUE is the current value for the KEY.
########################
# EXTERNAL cache entries
########################
//Path to a program.
CMAKE_ADDR2LINE:FILEPATH=/usr/bin/addr2line
//Path to a program.
CMAKE_AR:FILEPATH=/usr/bin/ar
//ASM compiler
CMAKE_ASM_COMPILER:FILEPATH=/usr/bin/cc
//A wrapper around 'ar' adding the appropriate '--plugin' option
// for the GCC compiler
CMAKE_ASM_COMPILER_AR:FILEPATH=/usr/bin/gcc-ar
//A wrapper around 'ranlib' adding the appropriate '--plugin' option
// for the GCC compiler
CMAKE_ASM_COMPILER_RANLIB:FILEPATH=/usr/bin/gcc-ranlib
//Flags used by the ASM compiler during all build types.
CMAKE_ASM_FLAGS:STRING=
//Flags used by the ASM compiler during DEBUG builds.
CMAKE_ASM_FLAGS_DEBUG:STRING=-g
//Flags used by the ASM compiler during MINSIZEREL builds.
CMAKE_ASM_FLAGS_MINSIZEREL:STRING=-Os -DNDEBUG
//Flags used by the ASM compiler during RELEASE builds.
CMAKE_ASM_FLAGS_RELEASE:STRING=-O3 -DNDEBUG
//Flags used by the ASM compiler during RELWITHDEBINFO builds.
CMAKE_ASM_FLAGS_RELWITHDEBINFO:STRING=-O2 -g -DNDEBUG
//Choose the type of build, options are: None Debug Release RelWithDebInfo
// MinSizeRel ...
CMAKE_BUILD_TYPE:STRING=Release
//Enable/Disable color output during build.
CMAKE_COLOR_MAKEFILE:BOOL=ON
//C compiler
CMAKE_C_COMPILER:FILEPATH=/usr/bin/cc
//A wrapper around 'ar' adding the appropriate '--plugin' option
// for the GCC compiler
CMAKE_C_COMPILER_AR:FILEPATH=/usr/bin/gcc-ar
//A wrapper around 'ranlib' adding the appropriate '--plugin' option
// for the GCC compiler
CMAKE_C_COMPILER_RANLIB:FILEPATH=/usr/bin/gcc-ranlib
//Flags used by the C compiler during all build types.
CMAKE_C_FLAGS:STRING=
//Flags used by the C compiler during DEBUG builds.
CMAKE_C_FLAGS_DEBUG:STRING=-g
//Flags used by the C compiler during MINSIZEREL builds.
CMAKE_C_FLAGS_MINSIZEREL:STRING=-Os -DNDEBUG
//Flags used by the C compiler during RELEASE builds.
CMAKE_C_FLAGS_RELEASE:STRING=-O3 -DNDEBUG
//Flags used by the C compiler during RELWITHDEBINFO builds.
CMAKE_C_FLAGS_RELWITHDEBINFO:STRING=-O2 -g -DNDEBUG
//Path to a program.
CMAKE_DLLTOOL:FILEPATH=CMAKE_DLLTOOL-NOTFOUND
//Flags used by the linker during all build types.
CMAKE_EXE_LINKER_FLAGS:STRING=
//Flags used by the linker during DEBUG builds.
CMAKE_EXE_LINKER_FLAGS_DEBUG:STRING=
//Flags used by the linker during MINSIZEREL builds.
CMAKE_EXE_LINKER_FLAGS_MINSIZEREL:STRING=
//Flags used by the linker during RELEASE builds.
CMAKE_EXE_LINKER_FLAGS_RELEASE:STRING=
//Flags used by the linker during RELWITHDEBINFO builds.
CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
//Enable/Disable output of compile commands during generation.
CMAKE_EXPORT_COMPILE_COMMANDS:BOOL=
//Value Computed by CMake.
CMAKE_FIND_PACKAGE_REDIRECTS_DIR:STATIC=/home/jeans/Repos/research/pqc/where-simd-helps/harness/build-papi/CMakeFiles/pkgRedirects
//Install path prefix, prepended onto install directories.
CMAKE_INSTALL_PREFIX:PATH=/usr/local
//Path to a program.
CMAKE_LINKER:FILEPATH=/usr/bin/ld
//Path to a program.
CMAKE_MAKE_PROGRAM:FILEPATH=/usr/bin/make
//Flags used by the linker during the creation of modules during
// all build types.
CMAKE_MODULE_LINKER_FLAGS:STRING=
//Flags used by the linker during the creation of modules during
// DEBUG builds.
CMAKE_MODULE_LINKER_FLAGS_DEBUG:STRING=
//Flags used by the linker during the creation of modules during
// MINSIZEREL builds.
CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL:STRING=
//Flags used by the linker during the creation of modules during
// RELEASE builds.
CMAKE_MODULE_LINKER_FLAGS_RELEASE:STRING=
//Flags used by the linker during the creation of modules during
// RELWITHDEBINFO builds.
CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
//Path to a program.
CMAKE_NM:FILEPATH=/usr/bin/nm
//Path to a program.
CMAKE_OBJCOPY:FILEPATH=/usr/bin/objcopy
//Path to a program.
CMAKE_OBJDUMP:FILEPATH=/usr/bin/objdump
//Value Computed by CMake
CMAKE_PROJECT_COMPAT_VERSION:STATIC=
//Value Computed by CMake
CMAKE_PROJECT_DESCRIPTION:STATIC=
//Value Computed by CMake
CMAKE_PROJECT_HOMEPAGE_URL:STATIC=
//Value Computed by CMake
CMAKE_PROJECT_NAME:STATIC=pqc-bench
//Value Computed by CMake
CMAKE_PROJECT_SPDX_LICENSE:STATIC=
//Path to a program.
CMAKE_RANLIB:FILEPATH=/usr/bin/ranlib
//Path to a program.
CMAKE_READELF:FILEPATH=/usr/bin/readelf
//Flags used by the linker during the creation of shared libraries
// during all build types.
CMAKE_SHARED_LINKER_FLAGS:STRING=
//Flags used by the linker during the creation of shared libraries
// during DEBUG builds.
CMAKE_SHARED_LINKER_FLAGS_DEBUG:STRING=
//Flags used by the linker during the creation of shared libraries
// during MINSIZEREL builds.
CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL:STRING=
//Flags used by the linker during the creation of shared libraries
// during RELEASE builds.
CMAKE_SHARED_LINKER_FLAGS_RELEASE:STRING=
//Flags used by the linker during the creation of shared libraries
// during RELWITHDEBINFO builds.
CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO:STRING=
//If set, runtime paths are not added when installing shared libraries,
// but are added when building.
CMAKE_SKIP_INSTALL_RPATH:BOOL=NO
//If set, runtime paths are not added when using shared libraries.
CMAKE_SKIP_RPATH:BOOL=NO
//Flags used by the archiver during the creation of static libraries
// during all build types.
CMAKE_STATIC_LINKER_FLAGS:STRING=
//Flags used by the archiver during the creation of static libraries
// during DEBUG builds.
CMAKE_STATIC_LINKER_FLAGS_DEBUG:STRING=
//Flags used by the archiver during the creation of static libraries
// during MINSIZEREL builds.
CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL:STRING=
//Flags used by the archiver during the creation of static libraries
// during RELEASE builds.
CMAKE_STATIC_LINKER_FLAGS_RELEASE:STRING=
//Flags used by the archiver during the creation of static libraries
// during RELWITHDEBINFO builds.
CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO:STRING=
//Path to a program.
CMAKE_STRIP:FILEPATH=/usr/bin/strip
//Path to a program.
CMAKE_TAPI:FILEPATH=CMAKE_TAPI-NOTFOUND
//If this value is on, makefiles will be generated without the
// .SILENT directive, and all commands will be echoed to the console
// during the make. This is useful for debugging only. With Visual
// Studio IDE projects all commands are done without /nologo.
CMAKE_VERBOSE_MAKEFILE:BOOL=FALSE
//Path to a library.
PAPI_LIB:FILEPATH=PAPI_LIB-NOTFOUND
//Link against PAPI for hardware counter collection
WITH_PAPI:BOOL=ON
//Value Computed by CMake
pqc-bench_BINARY_DIR:STATIC=/home/jeans/Repos/research/pqc/where-simd-helps/harness/build-papi
//Value Computed by CMake
pqc-bench_IS_TOP_LEVEL:STATIC=ON
//Value Computed by CMake
pqc-bench_SOURCE_DIR:STATIC=/home/jeans/Repos/research/pqc/where-simd-helps/harness
########################
# INTERNAL cache entries
########################
//ADVANCED property for variable: CMAKE_ADDR2LINE
CMAKE_ADDR2LINE-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_AR
CMAKE_AR-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_ASM_COMPILER
CMAKE_ASM_COMPILER-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_ASM_COMPILER_AR
CMAKE_ASM_COMPILER_AR-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_ASM_COMPILER_RANLIB
CMAKE_ASM_COMPILER_RANLIB-ADVANCED:INTERNAL=1
CMAKE_ASM_COMPILER_WORKS:INTERNAL=1
//ADVANCED property for variable: CMAKE_ASM_FLAGS
CMAKE_ASM_FLAGS-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_ASM_FLAGS_DEBUG
CMAKE_ASM_FLAGS_DEBUG-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_ASM_FLAGS_MINSIZEREL
CMAKE_ASM_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_ASM_FLAGS_RELEASE
CMAKE_ASM_FLAGS_RELEASE-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_ASM_FLAGS_RELWITHDEBINFO
CMAKE_ASM_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
//This is the directory where this CMakeCache.txt was created
CMAKE_CACHEFILE_DIR:INTERNAL=/home/jeans/Repos/research/pqc/where-simd-helps/harness/build-papi
//Major version of cmake used to create the current loaded cache
CMAKE_CACHE_MAJOR_VERSION:INTERNAL=4
//Minor version of cmake used to create the current loaded cache
CMAKE_CACHE_MINOR_VERSION:INTERNAL=3
//Patch version of cmake used to create the current loaded cache
CMAKE_CACHE_PATCH_VERSION:INTERNAL=1
//ADVANCED property for variable: CMAKE_COLOR_MAKEFILE
CMAKE_COLOR_MAKEFILE-ADVANCED:INTERNAL=1
//Path to CMake executable.
CMAKE_COMMAND:INTERNAL=/usr/bin/cmake
//Path to cpack program executable.
CMAKE_CPACK_COMMAND:INTERNAL=/usr/bin/cpack
//Path to ctest program executable.
CMAKE_CTEST_COMMAND:INTERNAL=/usr/bin/ctest
//ADVANCED property for variable: CMAKE_C_COMPILER
CMAKE_C_COMPILER-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_C_COMPILER_AR
CMAKE_C_COMPILER_AR-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_C_COMPILER_RANLIB
CMAKE_C_COMPILER_RANLIB-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_C_FLAGS
CMAKE_C_FLAGS-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_C_FLAGS_DEBUG
CMAKE_C_FLAGS_DEBUG-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_C_FLAGS_MINSIZEREL
CMAKE_C_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_RELEASE-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_C_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_DLLTOOL
CMAKE_DLLTOOL-ADVANCED:INTERNAL=1
//Path to cache edit program executable.
CMAKE_EDIT_COMMAND:INTERNAL=/usr/bin/ccmake
//Executable file format
CMAKE_EXECUTABLE_FORMAT:INTERNAL=ELF
//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS
CMAKE_EXE_LINKER_FLAGS-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_DEBUG
CMAKE_EXE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_MINSIZEREL
CMAKE_EXE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELEASE
CMAKE_EXE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO
CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_EXPORT_COMPILE_COMMANDS
CMAKE_EXPORT_COMPILE_COMMANDS-ADVANCED:INTERNAL=1
//Name of external makefile project generator.
CMAKE_EXTRA_GENERATOR:INTERNAL=
//Name of generator.
CMAKE_GENERATOR:INTERNAL=Unix Makefiles
//Generator instance identifier.
CMAKE_GENERATOR_INSTANCE:INTERNAL=
//Name of generator platform.
CMAKE_GENERATOR_PLATFORM:INTERNAL=
//Name of generator toolset.
CMAKE_GENERATOR_TOOLSET:INTERNAL=
//Source directory with the top level CMakeLists.txt file for this
// project
CMAKE_HOME_DIRECTORY:INTERNAL=/home/jeans/Repos/research/pqc/where-simd-helps/harness
//Install .so files without execute permission.
CMAKE_INSTALL_SO_NO_EXE:INTERNAL=0
//ADVANCED property for variable: CMAKE_LINKER
CMAKE_LINKER-ADVANCED:INTERNAL=1
//Name of CMakeLists files to read
CMAKE_LIST_FILE_NAME:INTERNAL=CMakeLists.txt
//ADVANCED property for variable: CMAKE_MAKE_PROGRAM
CMAKE_MAKE_PROGRAM-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS
CMAKE_MODULE_LINKER_FLAGS-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_DEBUG
CMAKE_MODULE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL
CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELEASE
CMAKE_MODULE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO
CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_NM
CMAKE_NM-ADVANCED:INTERNAL=1
//number of local generators
CMAKE_NUMBER_OF_MAKEFILES:INTERNAL=1
//ADVANCED property for variable: CMAKE_OBJCOPY
CMAKE_OBJCOPY-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_OBJDUMP
CMAKE_OBJDUMP-ADVANCED:INTERNAL=1
//Platform information initialized
CMAKE_PLATFORM_INFO_INITIALIZED:INTERNAL=1
//ADVANCED property for variable: CMAKE_RANLIB
CMAKE_RANLIB-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_READELF
CMAKE_READELF-ADVANCED:INTERNAL=1
//Path to CMake installation.
CMAKE_ROOT:INTERNAL=/usr/share/cmake
//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS
CMAKE_SHARED_LINKER_FLAGS-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_DEBUG
CMAKE_SHARED_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL
CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELEASE
CMAKE_SHARED_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO
CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_SKIP_INSTALL_RPATH
CMAKE_SKIP_INSTALL_RPATH-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_SKIP_RPATH
CMAKE_SKIP_RPATH-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS
CMAKE_STATIC_LINKER_FLAGS-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_DEBUG
CMAKE_STATIC_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL
CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELEASE
CMAKE_STATIC_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO
CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_STRIP
CMAKE_STRIP-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_TAPI
CMAKE_TAPI-ADVANCED:INTERNAL=1
//uname command
CMAKE_UNAME:INTERNAL=/usr/bin/uname
//ADVANCED property for variable: CMAKE_VERBOSE_MAKEFILE
CMAKE_VERBOSE_MAKEFILE-ADVANCED:INTERNAL=1

View File

@ -0,0 +1,30 @@
set(CMAKE_ASM_COMPILER "/usr/bin/cc")
set(CMAKE_ASM_COMPILER_ARG1 "")
set(CMAKE_AR "/usr/bin/ar")
set(CMAKE_ASM_COMPILER_AR "/usr/bin/gcc-ar")
set(CMAKE_RANLIB "/usr/bin/ranlib")
set(CMAKE_ASM_COMPILER_RANLIB "/usr/bin/gcc-ranlib")
set(CMAKE_LINKER "/usr/bin/ld")
set(CMAKE_LINKER_LINK "")
set(CMAKE_LINKER_LLD "")
set(CMAKE_ASM_COMPILER_LINKER "")
set(CMAKE_ASM_COMPILER_LINKER_ID "")
set(CMAKE_ASM_COMPILER_LINKER_VERSION )
set(CMAKE_ASM_COMPILER_LINKER_FRONTEND_VARIANT )
set(CMAKE_MT "")
set(CMAKE_TAPI "CMAKE_TAPI-NOTFOUND")
set(CMAKE_ASM_COMPILER_LOADED 1)
set(CMAKE_ASM_COMPILER_ID "GNU")
set(CMAKE_ASM_COMPILER_VERSION "")
set(CMAKE_ASM_COMPILER_ENV_VAR "ASM")
set(CMAKE_ASM_COMPILER_ARCHITECTURE_ID "")
set(CMAKE_ASM_IGNORE_EXTENSIONS h;H;o;O;obj;OBJ;def;DEF;rc;RC)
set(CMAKE_ASM_LINKER_PREFERENCE 0)
set(CMAKE_ASM_LINKER_DEPFILE_SUPPORTED )
set(CMAKE_LINKER_PUSHPOP_STATE_SUPPORTED TRUE)
set(CMAKE_ASM_LINKER_PUSHPOP_STATE_SUPPORTED )

View File

@ -0,0 +1,85 @@
set(CMAKE_C_COMPILER "/usr/bin/cc")
set(CMAKE_C_COMPILER_ARG1 "")
set(CMAKE_C_COMPILER_ID "GNU")
set(CMAKE_C_COMPILER_VERSION "15.2.1")
set(CMAKE_C_COMPILER_VERSION_INTERNAL "")
set(CMAKE_C_COMPILER_WRAPPER "")
set(CMAKE_C_STANDARD_COMPUTED_DEFAULT "23")
set(CMAKE_C_EXTENSIONS_COMPUTED_DEFAULT "ON")
set(CMAKE_C_STANDARD_LATEST "23")
set(CMAKE_C_COMPILE_FEATURES "c_std_90;c_function_prototypes;c_std_99;c_restrict;c_variadic_macros;c_std_11;c_static_assert;c_std_17;c_std_23")
set(CMAKE_C90_COMPILE_FEATURES "c_std_90;c_function_prototypes")
set(CMAKE_C99_COMPILE_FEATURES "c_std_99;c_restrict;c_variadic_macros")
set(CMAKE_C11_COMPILE_FEATURES "c_std_11;c_static_assert")
set(CMAKE_C17_COMPILE_FEATURES "c_std_17")
set(CMAKE_C23_COMPILE_FEATURES "c_std_23")
set(CMAKE_C_PLATFORM_ID "Linux")
set(CMAKE_C_SIMULATE_ID "")
set(CMAKE_C_COMPILER_FRONTEND_VARIANT "GNU")
set(CMAKE_C_COMPILER_APPLE_SYSROOT "")
set(CMAKE_C_SIMULATE_VERSION "")
set(CMAKE_C_COMPILER_ARCHITECTURE_ID "x86_64")
set(CMAKE_AR "/usr/bin/ar")
set(CMAKE_C_COMPILER_AR "/usr/bin/gcc-ar")
set(CMAKE_RANLIB "/usr/bin/ranlib")
set(CMAKE_C_COMPILER_RANLIB "/usr/bin/gcc-ranlib")
set(CMAKE_LINKER "/usr/bin/ld")
set(CMAKE_LINKER_LINK "")
set(CMAKE_LINKER_LLD "")
set(CMAKE_C_COMPILER_LINKER "/usr/bin/ld")
set(CMAKE_C_COMPILER_LINKER_ID "GNU")
set(CMAKE_C_COMPILER_LINKER_VERSION 2.46)
set(CMAKE_C_COMPILER_LINKER_FRONTEND_VARIANT GNU)
set(CMAKE_MT "")
set(CMAKE_TAPI "CMAKE_TAPI-NOTFOUND")
set(CMAKE_COMPILER_IS_GNUCC 1)
set(CMAKE_C_COMPILER_LOADED 1)
set(CMAKE_C_COMPILER_WORKS TRUE)
set(CMAKE_C_ABI_COMPILED TRUE)
set(CMAKE_C_COMPILER_ENV_VAR "CC")
set(CMAKE_C_COMPILER_ID_RUN 1)
set(CMAKE_C_SOURCE_FILE_EXTENSIONS c;m)
set(CMAKE_C_IGNORE_EXTENSIONS h;H;o;O;obj;OBJ;def;DEF;rc;RC)
set(CMAKE_C_LINKER_PREFERENCE 10)
set(CMAKE_C_LINKER_DEPFILE_SUPPORTED TRUE)
set(CMAKE_LINKER_PUSHPOP_STATE_SUPPORTED TRUE)
set(CMAKE_C_LINKER_PUSHPOP_STATE_SUPPORTED TRUE)
# Save compiler ABI information.
set(CMAKE_C_SIZEOF_DATA_PTR "8")
set(CMAKE_C_COMPILER_ABI "ELF")
set(CMAKE_C_BYTE_ORDER "LITTLE_ENDIAN")
set(CMAKE_C_LIBRARY_ARCHITECTURE "")
if(CMAKE_C_SIZEOF_DATA_PTR)
set(CMAKE_SIZEOF_VOID_P "${CMAKE_C_SIZEOF_DATA_PTR}")
endif()
if(CMAKE_C_COMPILER_ABI)
set(CMAKE_INTERNAL_PLATFORM_ABI "${CMAKE_C_COMPILER_ABI}")
endif()
if(CMAKE_C_LIBRARY_ARCHITECTURE)
set(CMAKE_LIBRARY_ARCHITECTURE "")
endif()
set(CMAKE_C_CL_SHOWINCLUDES_PREFIX "")
if(CMAKE_C_CL_SHOWINCLUDES_PREFIX)
set(CMAKE_CL_SHOWINCLUDES_PREFIX "${CMAKE_C_CL_SHOWINCLUDES_PREFIX}")
endif()
set(CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES "/usr/lib/gcc/x86_64-pc-linux-gnu/15.2.1/include;/usr/local/include;/usr/lib/gcc/x86_64-pc-linux-gnu/15.2.1/include-fixed;/usr/include")
set(CMAKE_C_IMPLICIT_LINK_LIBRARIES "gcc;gcc_s;c;gcc;gcc_s")
set(CMAKE_C_IMPLICIT_LINK_DIRECTORIES "/usr/lib/gcc/x86_64-pc-linux-gnu/15.2.1;/usr/lib;/lib")
set(CMAKE_C_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES "")

View File

@ -0,0 +1,15 @@
set(CMAKE_HOST_SYSTEM "Linux-6.19.10-arch1-1")
set(CMAKE_HOST_SYSTEM_NAME "Linux")
set(CMAKE_HOST_SYSTEM_VERSION "6.19.10-arch1-1")
set(CMAKE_HOST_SYSTEM_PROCESSOR "x86_64")
set(CMAKE_SYSTEM "Linux-6.19.10-arch1-1")
set(CMAKE_SYSTEM_NAME "Linux")
set(CMAKE_SYSTEM_VERSION "6.19.10-arch1-1")
set(CMAKE_SYSTEM_PROCESSOR "x86_64")
set(CMAKE_CROSSCOMPILING "FALSE")
set(CMAKE_SYSTEM_LOADED 1)

View File

@ -0,0 +1,934 @@
#ifdef __cplusplus
# error "A C++ compiler has been selected for C."
#endif
#if defined(__18CXX)
# define ID_VOID_MAIN
#endif
#if defined(__CLASSIC_C__)
/* cv-qualifiers did not exist in K&R C */
# define const
# define volatile
#endif
#if !defined(__has_include)
/* If the compiler does not have __has_include, pretend the answer is
always no. */
# define __has_include(x) 0
#endif
/* Version number components: V=Version, R=Revision, P=Patch
Version date components: YYYY=Year, MM=Month, DD=Day */
#if defined(__INTEL_COMPILER) || defined(__ICC)
# define COMPILER_ID "Intel"
# if defined(_MSC_VER)
# define SIMULATE_ID "MSVC"
# endif
# if defined(__GNUC__)
# define SIMULATE_ID "GNU"
# endif
/* __INTEL_COMPILER = VRP prior to 2021, and then VVVV for 2021 and later,
except that a few beta releases use the old format with V=2021. */
# if __INTEL_COMPILER < 2021 || __INTEL_COMPILER == 202110 || __INTEL_COMPILER == 202111
# define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER/100)
# define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER/10 % 10)
# if defined(__INTEL_COMPILER_UPDATE)
# define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER_UPDATE)
# else
# define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER % 10)
# endif
# else
# define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER)
# define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER_UPDATE)
/* The third version component from --version is an update index,
but no macro is provided for it. */
# define COMPILER_VERSION_PATCH DEC(0)
# endif
# if defined(__INTEL_COMPILER_BUILD_DATE)
/* __INTEL_COMPILER_BUILD_DATE = YYYYMMDD */
# define COMPILER_VERSION_TWEAK DEC(__INTEL_COMPILER_BUILD_DATE)
# endif
# if defined(_MSC_VER)
/* _MSC_VER = VVRR */
# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
# endif
# if defined(__GNUC__)
# define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
# elif defined(__GNUG__)
# define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
# endif
# if defined(__GNUC_MINOR__)
# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
# endif
# if defined(__GNUC_PATCHLEVEL__)
# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
# endif
#elif (defined(__clang__) && defined(__INTEL_CLANG_COMPILER)) || defined(__INTEL_LLVM_COMPILER)
# define COMPILER_ID "IntelLLVM"
#if defined(_MSC_VER)
# define SIMULATE_ID "MSVC"
#endif
#if defined(__GNUC__)
# define SIMULATE_ID "GNU"
#endif
/* __INTEL_LLVM_COMPILER = VVVVRP prior to 2021.2.0, VVVVRRPP for 2021.2.0 and
* later. Look for 6 digit vs. 8 digit version number to decide encoding.
* VVVV is no smaller than the current year when a version is released.
*/
#if __INTEL_LLVM_COMPILER < 1000000L
# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/100)
# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/10 % 10)
# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER % 10)
#else
# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/10000)
# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/100 % 100)
# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER % 100)
#endif
#if defined(_MSC_VER)
/* _MSC_VER = VVRR */
# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
#endif
#if defined(__GNUC__)
# define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
#elif defined(__GNUG__)
# define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
#endif
#if defined(__GNUC_MINOR__)
# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
#endif
#if defined(__GNUC_PATCHLEVEL__)
# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
#endif
#elif defined(__PATHCC__)
# define COMPILER_ID "PathScale"
# define COMPILER_VERSION_MAJOR DEC(__PATHCC__)
# define COMPILER_VERSION_MINOR DEC(__PATHCC_MINOR__)
# if defined(__PATHCC_PATCHLEVEL__)
# define COMPILER_VERSION_PATCH DEC(__PATHCC_PATCHLEVEL__)
# endif
#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__)
# define COMPILER_ID "Embarcadero"
# define COMPILER_VERSION_MAJOR HEX(__CODEGEARC_VERSION__>>24 & 0x00FF)
# define COMPILER_VERSION_MINOR HEX(__CODEGEARC_VERSION__>>16 & 0x00FF)
# define COMPILER_VERSION_PATCH DEC(__CODEGEARC_VERSION__ & 0xFFFF)
#elif defined(__BORLANDC__)
# define COMPILER_ID "Borland"
/* __BORLANDC__ = 0xVRR */
# define COMPILER_VERSION_MAJOR HEX(__BORLANDC__>>8)
# define COMPILER_VERSION_MINOR HEX(__BORLANDC__ & 0xFF)
#elif defined(__WATCOMC__) && __WATCOMC__ < 1200
# define COMPILER_ID "Watcom"
/* __WATCOMC__ = VVRR */
# define COMPILER_VERSION_MAJOR DEC(__WATCOMC__ / 100)
# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
# if (__WATCOMC__ % 10) > 0
# define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
# endif
#elif defined(__WATCOMC__)
# define COMPILER_ID "OpenWatcom"
/* __WATCOMC__ = VVRP + 1100 */
# define COMPILER_VERSION_MAJOR DEC((__WATCOMC__ - 1100) / 100)
# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
# if (__WATCOMC__ % 10) > 0
# define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
# endif
#elif defined(__SUNPRO_C)
# define COMPILER_ID "SunPro"
# if __SUNPRO_C >= 0x5100
/* __SUNPRO_C = 0xVRRP */
# define COMPILER_VERSION_MAJOR HEX(__SUNPRO_C>>12)
# define COMPILER_VERSION_MINOR HEX(__SUNPRO_C>>4 & 0xFF)
# define COMPILER_VERSION_PATCH HEX(__SUNPRO_C & 0xF)
# else
/* __SUNPRO_CC = 0xVRP */
# define COMPILER_VERSION_MAJOR HEX(__SUNPRO_C>>8)
# define COMPILER_VERSION_MINOR HEX(__SUNPRO_C>>4 & 0xF)
# define COMPILER_VERSION_PATCH HEX(__SUNPRO_C & 0xF)
# endif
#elif defined(__HP_cc)
# define COMPILER_ID "HP"
/* __HP_cc = VVRRPP */
# define COMPILER_VERSION_MAJOR DEC(__HP_cc/10000)
# define COMPILER_VERSION_MINOR DEC(__HP_cc/100 % 100)
# define COMPILER_VERSION_PATCH DEC(__HP_cc % 100)
#elif defined(__DECC)
# define COMPILER_ID "Compaq"
/* __DECC_VER = VVRRTPPPP */
# define COMPILER_VERSION_MAJOR DEC(__DECC_VER/10000000)
# define COMPILER_VERSION_MINOR DEC(__DECC_VER/100000 % 100)
# define COMPILER_VERSION_PATCH DEC(__DECC_VER % 10000)
#elif defined(__IBMC__) && defined(__COMPILER_VER__)
# define COMPILER_ID "zOS"
/* __IBMC__ = VRP */
# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100)
# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10)
# define COMPILER_VERSION_PATCH DEC(__IBMC__ % 10)
#elif defined(__open_xl__) && defined(__clang__)
# define COMPILER_ID "IBMClang"
# define COMPILER_VERSION_MAJOR DEC(__open_xl_version__)
# define COMPILER_VERSION_MINOR DEC(__open_xl_release__)
# define COMPILER_VERSION_PATCH DEC(__open_xl_modification__)
# define COMPILER_VERSION_TWEAK DEC(__open_xl_ptf_fix_level__)
# define COMPILER_VERSION_INTERNAL_STR __clang_version__
#elif defined(__ibmxl__) && defined(__clang__)
# define COMPILER_ID "XLClang"
# define COMPILER_VERSION_MAJOR DEC(__ibmxl_version__)
# define COMPILER_VERSION_MINOR DEC(__ibmxl_release__)
# define COMPILER_VERSION_PATCH DEC(__ibmxl_modification__)
# define COMPILER_VERSION_TWEAK DEC(__ibmxl_ptf_fix_level__)
#elif defined(__IBMC__) && !defined(__COMPILER_VER__) && __IBMC__ >= 800
# define COMPILER_ID "XL"
/* __IBMC__ = VRP */
# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100)
# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10)
# define COMPILER_VERSION_PATCH DEC(__IBMC__ % 10)
#elif defined(__IBMC__) && !defined(__COMPILER_VER__) && __IBMC__ < 800
# define COMPILER_ID "VisualAge"
/* __IBMC__ = VRP */
# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100)
# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10)
# define COMPILER_VERSION_PATCH DEC(__IBMC__ % 10)
#elif defined(__NVCOMPILER)
# define COMPILER_ID "NVHPC"
# define COMPILER_VERSION_MAJOR DEC(__NVCOMPILER_MAJOR__)
# define COMPILER_VERSION_MINOR DEC(__NVCOMPILER_MINOR__)
# if defined(__NVCOMPILER_PATCHLEVEL__)
# define COMPILER_VERSION_PATCH DEC(__NVCOMPILER_PATCHLEVEL__)
# endif
#elif defined(__PGI)
# define COMPILER_ID "PGI"
# define COMPILER_VERSION_MAJOR DEC(__PGIC__)
# define COMPILER_VERSION_MINOR DEC(__PGIC_MINOR__)
# if defined(__PGIC_PATCHLEVEL__)
# define COMPILER_VERSION_PATCH DEC(__PGIC_PATCHLEVEL__)
# endif
#elif defined(__clang__) && defined(__cray__)
# define COMPILER_ID "CrayClang"
# define COMPILER_VERSION_MAJOR DEC(__cray_major__)
# define COMPILER_VERSION_MINOR DEC(__cray_minor__)
# define COMPILER_VERSION_PATCH DEC(__cray_patchlevel__)
# define COMPILER_VERSION_INTERNAL_STR __clang_version__
#elif defined(_CRAYC)
# define COMPILER_ID "Cray"
# define COMPILER_VERSION_MAJOR DEC(_RELEASE_MAJOR)
# define COMPILER_VERSION_MINOR DEC(_RELEASE_MINOR)
#elif defined(__TI_COMPILER_VERSION__)
# define COMPILER_ID "TI"
/* __TI_COMPILER_VERSION__ = VVVRRRPPP */
# define COMPILER_VERSION_MAJOR DEC(__TI_COMPILER_VERSION__/1000000)
# define COMPILER_VERSION_MINOR DEC(__TI_COMPILER_VERSION__/1000 % 1000)
# define COMPILER_VERSION_PATCH DEC(__TI_COMPILER_VERSION__ % 1000)
#elif defined(__CLANG_FUJITSU)
# define COMPILER_ID "FujitsuClang"
# define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
# define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
# define COMPILER_VERSION_INTERNAL_STR __clang_version__
#elif defined(__FUJITSU)
# define COMPILER_ID "Fujitsu"
# if defined(__FCC_version__)
# define COMPILER_VERSION __FCC_version__
# elif defined(__FCC_major__)
# define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
# define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
# endif
# if defined(__fcc_version)
# define COMPILER_VERSION_INTERNAL DEC(__fcc_version)
# elif defined(__FCC_VERSION)
# define COMPILER_VERSION_INTERNAL DEC(__FCC_VERSION)
# endif
#elif defined(__ghs__)
# define COMPILER_ID "GHS"
/* __GHS_VERSION_NUMBER = VVVVRP */
# ifdef __GHS_VERSION_NUMBER
# define COMPILER_VERSION_MAJOR DEC(__GHS_VERSION_NUMBER / 100)
# define COMPILER_VERSION_MINOR DEC(__GHS_VERSION_NUMBER / 10 % 10)
# define COMPILER_VERSION_PATCH DEC(__GHS_VERSION_NUMBER % 10)
# endif
#elif defined(__TASKING__)
# define COMPILER_ID "Tasking"
# define COMPILER_VERSION_MAJOR DEC(__VERSION__/1000)
# define COMPILER_VERSION_MINOR DEC(__VERSION__ % 100)
# define COMPILER_VERSION_INTERNAL DEC(__VERSION__)
#elif defined(__ORANGEC__)
# define COMPILER_ID "OrangeC"
# define COMPILER_VERSION_MAJOR DEC(__ORANGEC_MAJOR__)
# define COMPILER_VERSION_MINOR DEC(__ORANGEC_MINOR__)
# define COMPILER_VERSION_PATCH DEC(__ORANGEC_PATCHLEVEL__)
#elif defined(__RENESAS__)
# define COMPILER_ID "Renesas"
/* __RENESAS_VERSION__ = 0xVVRRPP00 */
# define COMPILER_VERSION_MAJOR HEX(__RENESAS_VERSION__ >> 24 & 0xFF)
# define COMPILER_VERSION_MINOR HEX(__RENESAS_VERSION__ >> 16 & 0xFF)
# define COMPILER_VERSION_PATCH HEX(__RENESAS_VERSION__ >> 8 & 0xFF)
#elif defined(__TINYC__)
# define COMPILER_ID "TinyCC"
#elif defined(__BCC__)
# define COMPILER_ID "Bruce"
#elif defined(__SCO_VERSION__)
# define COMPILER_ID "SCO"
#elif defined(__ARMCC_VERSION) && !defined(__clang__)
# define COMPILER_ID "ARMCC"
#if __ARMCC_VERSION >= 1000000
/* __ARMCC_VERSION = VRRPPPP */
# define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/1000000)
# define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 100)
# define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION % 10000)
#else
/* __ARMCC_VERSION = VRPPPP */
# define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/100000)
# define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 10)
# define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION % 10000)
#endif
#elif defined(__clang__) && defined(__apple_build_version__)
# define COMPILER_ID "AppleClang"
# if defined(_MSC_VER)
# define SIMULATE_ID "MSVC"
# endif
# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
# if defined(_MSC_VER)
/* _MSC_VER = VVRR */
# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
# endif
# define COMPILER_VERSION_TWEAK DEC(__apple_build_version__)
#elif defined(__clang__) && defined(__ARMCOMPILER_VERSION)
# define COMPILER_ID "ARMClang"
# define COMPILER_VERSION_MAJOR DEC(__ARMCOMPILER_VERSION/1000000)
# define COMPILER_VERSION_MINOR DEC(__ARMCOMPILER_VERSION/10000 % 100)
# define COMPILER_VERSION_PATCH DEC(__ARMCOMPILER_VERSION/100 % 100)
# define COMPILER_VERSION_INTERNAL DEC(__ARMCOMPILER_VERSION)
#elif defined(__clang__) && defined(__ti__)
# define COMPILER_ID "TIClang"
# define COMPILER_VERSION_MAJOR DEC(__ti_major__)
# define COMPILER_VERSION_MINOR DEC(__ti_minor__)
# define COMPILER_VERSION_PATCH DEC(__ti_patchlevel__)
# define COMPILER_VERSION_INTERNAL DEC(__ti_version__)
#elif defined(__clang__)
# define COMPILER_ID "Clang"
# if defined(_MSC_VER)
# define SIMULATE_ID "MSVC"
# endif
# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
# if defined(_MSC_VER)
/* _MSC_VER = VVRR */
# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
# endif
#elif defined(__LCC__) && (defined(__GNUC__) || defined(__GNUG__) || defined(__MCST__))
# define COMPILER_ID "LCC"
# define COMPILER_VERSION_MAJOR DEC(__LCC__ / 100)
# define COMPILER_VERSION_MINOR DEC(__LCC__ % 100)
# if defined(__LCC_MINOR__)
# define COMPILER_VERSION_PATCH DEC(__LCC_MINOR__)
# endif
# if defined(__GNUC__) && defined(__GNUC_MINOR__)
# define SIMULATE_ID "GNU"
# define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
# if defined(__GNUC_PATCHLEVEL__)
# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
# endif
# endif
#elif defined(__GNUC__)
# define COMPILER_ID "GNU"
# define COMPILER_VERSION_MAJOR DEC(__GNUC__)
# if defined(__GNUC_MINOR__)
# define COMPILER_VERSION_MINOR DEC(__GNUC_MINOR__)
# endif
# if defined(__GNUC_PATCHLEVEL__)
# define COMPILER_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
# endif
#elif defined(_MSC_VER)
# define COMPILER_ID "MSVC"
/* _MSC_VER = VVRR */
# define COMPILER_VERSION_MAJOR DEC(_MSC_VER / 100)
# define COMPILER_VERSION_MINOR DEC(_MSC_VER % 100)
# if defined(_MSC_FULL_VER)
# if _MSC_VER >= 1400
/* _MSC_FULL_VER = VVRRPPPPP */
# define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 100000)
# else
/* _MSC_FULL_VER = VVRRPPPP */
# define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 10000)
# endif
# endif
# if defined(_MSC_BUILD)
# define COMPILER_VERSION_TWEAK DEC(_MSC_BUILD)
# endif
#elif defined(_ADI_COMPILER)
# define COMPILER_ID "ADSP"
#if defined(__VERSIONNUM__)
/* __VERSIONNUM__ = 0xVVRRPPTT */
# define COMPILER_VERSION_MAJOR DEC(__VERSIONNUM__ >> 24 & 0xFF)
# define COMPILER_VERSION_MINOR DEC(__VERSIONNUM__ >> 16 & 0xFF)
# define COMPILER_VERSION_PATCH DEC(__VERSIONNUM__ >> 8 & 0xFF)
# define COMPILER_VERSION_TWEAK DEC(__VERSIONNUM__ & 0xFF)
#endif
#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
# define COMPILER_ID "IAR"
# if defined(__VER__) && defined(__ICCARM__)
# define COMPILER_VERSION_MAJOR DEC((__VER__) / 1000000)
# define COMPILER_VERSION_MINOR DEC(((__VER__) / 1000) % 1000)
# define COMPILER_VERSION_PATCH DEC((__VER__) % 1000)
# define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
# elif defined(__VER__) && (defined(__ICCAVR__) || defined(__ICCRX__) || defined(__ICCRH850__) || defined(__ICCRL78__) || defined(__ICC430__) || defined(__ICCRISCV__) || defined(__ICCV850__) || defined(__ICC8051__) || defined(__ICCSTM8__))
# define COMPILER_VERSION_MAJOR DEC((__VER__) / 100)
# define COMPILER_VERSION_MINOR DEC((__VER__) - (((__VER__) / 100)*100))
# define COMPILER_VERSION_PATCH DEC(__SUBVERSION__)
# define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
# endif
#elif defined(__DCC__) && defined(_DIAB_TOOL)
# define COMPILER_ID "Diab"
# define COMPILER_VERSION_MAJOR DEC(__VERSION_MAJOR_NUMBER__)
# define COMPILER_VERSION_MINOR DEC(__VERSION_MINOR_NUMBER__)
# define COMPILER_VERSION_PATCH DEC(__VERSION_ARCH_FEATURE_NUMBER__)
# define COMPILER_VERSION_TWEAK DEC(__VERSION_BUG_FIX_NUMBER__)
#elif defined(__SDCC_VERSION_MAJOR) || defined(SDCC)
# define COMPILER_ID "SDCC"
# if defined(__SDCC_VERSION_MAJOR)
# define COMPILER_VERSION_MAJOR DEC(__SDCC_VERSION_MAJOR)
# define COMPILER_VERSION_MINOR DEC(__SDCC_VERSION_MINOR)
# define COMPILER_VERSION_PATCH DEC(__SDCC_VERSION_PATCH)
# else
/* SDCC = VRP */
# define COMPILER_VERSION_MAJOR DEC(SDCC/100)
# define COMPILER_VERSION_MINOR DEC(SDCC/10 % 10)
# define COMPILER_VERSION_PATCH DEC(SDCC % 10)
# endif
/* These compilers are either not known or too old to define an
identification macro. Try to identify the platform and guess that
it is the native compiler. */
#elif defined(__hpux) || defined(__hpua)
# define COMPILER_ID "HP"
#else /* unknown compiler */
# define COMPILER_ID ""
#endif
/* Construct the string literal in pieces to prevent the source from
getting matched. Store it in a pointer rather than an array
because some compilers will just produce instructions to fill the
array rather than assigning a pointer to a static array. */
char const* info_compiler = "INFO" ":" "compiler[" COMPILER_ID "]";
#ifdef SIMULATE_ID
char const* info_simulate = "INFO" ":" "simulate[" SIMULATE_ID "]";
#endif
#ifdef __QNXNTO__
char const* qnxnto = "INFO" ":" "qnxnto[]";
#endif
#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
char const *info_cray = "INFO" ":" "compiler_wrapper[CrayPrgEnv]";
#endif
#define STRINGIFY_HELPER(X) #X
#define STRINGIFY(X) STRINGIFY_HELPER(X)
/* Identify known platforms by name. */
#if defined(__linux) || defined(__linux__) || defined(linux)
# define PLATFORM_ID "Linux"
#elif defined(__MSYS__)
# define PLATFORM_ID "MSYS"
#elif defined(__CYGWIN__)
# define PLATFORM_ID "Cygwin"
#elif defined(__MINGW32__)
# define PLATFORM_ID "MinGW"
#elif defined(__APPLE__)
# define PLATFORM_ID "Darwin"
#elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
# define PLATFORM_ID "Windows"
#elif defined(__FreeBSD__) || defined(__FreeBSD)
# define PLATFORM_ID "FreeBSD"
#elif defined(__NetBSD__) || defined(__NetBSD)
# define PLATFORM_ID "NetBSD"
#elif defined(__OpenBSD__) || defined(__OPENBSD)
# define PLATFORM_ID "OpenBSD"
#elif defined(__sun) || defined(sun)
# define PLATFORM_ID "SunOS"
#elif defined(_AIX) || defined(__AIX) || defined(__AIX__) || defined(__aix) || defined(__aix__)
# define PLATFORM_ID "AIX"
#elif defined(__hpux) || defined(__hpux__)
# define PLATFORM_ID "HP-UX"
#elif defined(__HAIKU__)
# define PLATFORM_ID "Haiku"
#elif defined(__BeOS) || defined(__BEOS__) || defined(_BEOS)
# define PLATFORM_ID "BeOS"
#elif defined(__QNX__) || defined(__QNXNTO__)
# define PLATFORM_ID "QNX"
#elif defined(__tru64) || defined(_tru64) || defined(__TRU64__)
# define PLATFORM_ID "Tru64"
#elif defined(__riscos) || defined(__riscos__)
# define PLATFORM_ID "RISCos"
#elif defined(__sinix) || defined(__sinix__) || defined(__SINIX__)
# define PLATFORM_ID "SINIX"
#elif defined(__UNIX_SV__)
# define PLATFORM_ID "UNIX_SV"
#elif defined(__bsdos__)
# define PLATFORM_ID "BSDOS"
#elif defined(_MPRAS) || defined(MPRAS)
# define PLATFORM_ID "MP-RAS"
#elif defined(__osf) || defined(__osf__)
# define PLATFORM_ID "OSF1"
#elif defined(_SCO_SV) || defined(SCO_SV) || defined(sco_sv)
# define PLATFORM_ID "SCO_SV"
#elif defined(__ultrix) || defined(__ultrix__) || defined(_ULTRIX)
# define PLATFORM_ID "ULTRIX"
#elif defined(__XENIX__) || defined(_XENIX) || defined(XENIX)
# define PLATFORM_ID "Xenix"
#elif defined(__WATCOMC__)
# if defined(__LINUX__)
# define PLATFORM_ID "Linux"
# elif defined(__DOS__)
# define PLATFORM_ID "DOS"
# elif defined(__OS2__)
# define PLATFORM_ID "OS2"
# elif defined(__WINDOWS__)
# define PLATFORM_ID "Windows3x"
# elif defined(__VXWORKS__)
# define PLATFORM_ID "VxWorks"
# else /* unknown platform */
# define PLATFORM_ID
# endif
#elif defined(__INTEGRITY)
# if defined(INT_178B)
# define PLATFORM_ID "Integrity178"
# else /* regular Integrity */
# define PLATFORM_ID "Integrity"
# endif
# elif defined(_ADI_COMPILER)
# define PLATFORM_ID "ADSP"
#else /* unknown platform */
# define PLATFORM_ID
#endif
/* For windows compilers MSVC and Intel we can determine
the architecture of the compiler being used. This is because
the compilers do not have flags that can change the architecture,
but rather depend on which compiler is being used
*/
/* ARCHITECTURE_ID can be derived from predefined macros only for toolchains
   whose target architecture is fixed by the compiler binary itself (MSVC,
   Watcom, IAR, Green Hills, TI, TASKING, Renesas).  Each branch below maps
   one vendor's target macros to a short architecture string; unknown
   architectures yield "" and an unmatched compiler leaves the macro empty. */
#if defined(_WIN32) && defined(_MSC_VER)
# if defined(_M_IA64)
# define ARCHITECTURE_ID "IA64"
# elif defined(_M_ARM64EC)
# define ARCHITECTURE_ID "ARM64EC"
# elif defined(_M_X64) || defined(_M_AMD64)
# define ARCHITECTURE_ID "x64"
# elif defined(_M_IX86)
# define ARCHITECTURE_ID "X86"
# elif defined(_M_ARM64)
# define ARCHITECTURE_ID "ARM64"
# elif defined(_M_ARM)
# if _M_ARM == 4
# define ARCHITECTURE_ID "ARMV4I"
# elif _M_ARM == 5
# define ARCHITECTURE_ID "ARMV5I"
# else
  /* _M_ARM carries the ARM version number; stringify it for other values.
     STRINGIFY is expected to be defined earlier in this file — TODO confirm. */
# define ARCHITECTURE_ID "ARMV" STRINGIFY(_M_ARM)
# endif
# elif defined(_M_MIPS)
# define ARCHITECTURE_ID "MIPS"
# elif defined(_M_SH)
# define ARCHITECTURE_ID "SHx"
# else /* unknown architecture */
# define ARCHITECTURE_ID ""
# endif
#elif defined(__WATCOMC__)
# if defined(_M_I86)
# define ARCHITECTURE_ID "I86"
# elif defined(_M_IX86)
# define ARCHITECTURE_ID "X86"
# else /* unknown architecture */
# define ARCHITECTURE_ID ""
# endif
#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
# if defined(__ICCARM__)
# define ARCHITECTURE_ID "ARM"
# elif defined(__ICCRX__)
# define ARCHITECTURE_ID "RX"
# elif defined(__ICCRH850__)
# define ARCHITECTURE_ID "RH850"
# elif defined(__ICCRL78__)
# define ARCHITECTURE_ID "RL78"
# elif defined(__ICCRISCV__)
# define ARCHITECTURE_ID "RISCV"
# elif defined(__ICCAVR__)
# define ARCHITECTURE_ID "AVR"
# elif defined(__ICC430__)
# define ARCHITECTURE_ID "MSP430"
# elif defined(__ICCV850__)
# define ARCHITECTURE_ID "V850"
# elif defined(__ICC8051__)
# define ARCHITECTURE_ID "8051"
# elif defined(__ICCSTM8__)
# define ARCHITECTURE_ID "STM8"
# else /* unknown architecture */
# define ARCHITECTURE_ID ""
# endif
#elif defined(__ghs__)
# if defined(__PPC64__)
# define ARCHITECTURE_ID "PPC64"
# elif defined(__ppc__)
# define ARCHITECTURE_ID "PPC"
# elif defined(__ARM__)
# define ARCHITECTURE_ID "ARM"
# elif defined(__x86_64__)
# define ARCHITECTURE_ID "x64"
# elif defined(__i386__)
# define ARCHITECTURE_ID "X86"
# else /* unknown architecture */
# define ARCHITECTURE_ID ""
# endif
#elif defined(__clang__) && defined(__ti__)
# if defined(__ARM_ARCH)
# define ARCHITECTURE_ID "ARM"
# else /* unknown architecture */
# define ARCHITECTURE_ID ""
# endif
#elif defined(__TI_COMPILER_VERSION__)
# if defined(__TI_ARM__)
# define ARCHITECTURE_ID "ARM"
# elif defined(__MSP430__)
# define ARCHITECTURE_ID "MSP430"
# elif defined(__TMS320C28XX__)
# define ARCHITECTURE_ID "TMS320C28x"
# elif defined(__TMS320C6X__) || defined(_TMS320C6X)
# define ARCHITECTURE_ID "TMS320C6x"
# else /* unknown architecture */
# define ARCHITECTURE_ID ""
# endif
/* The ADI SHARC/Blackfin cases below continue the OUTER #elif chain (note
   the indentation), not the TI-internal one closed by the # endif above. */
# elif defined(__ADSPSHARC__)
# define ARCHITECTURE_ID "SHARC"
# elif defined(__ADSPBLACKFIN__)
# define ARCHITECTURE_ID "Blackfin"
#elif defined(__TASKING__)
# if defined(__CTC__) || defined(__CPTC__)
# define ARCHITECTURE_ID "TriCore"
# elif defined(__CMCS__)
# define ARCHITECTURE_ID "MCS"
# elif defined(__CARM__) || defined(__CPARM__)
# define ARCHITECTURE_ID "ARM"
# elif defined(__CARC__)
# define ARCHITECTURE_ID "ARC"
# elif defined(__C51__)
# define ARCHITECTURE_ID "8051"
# elif defined(__CPCP__)
# define ARCHITECTURE_ID "PCP"
# else
# define ARCHITECTURE_ID ""
# endif
#elif defined(__RENESAS__)
# if defined(__CCRX__)
# define ARCHITECTURE_ID "RX"
# elif defined(__CCRL__)
# define ARCHITECTURE_ID "RL78"
# elif defined(__CCRH__)
# define ARCHITECTURE_ID "RH850"
# else
# define ARCHITECTURE_ID ""
# endif
#else
/* Architecture not determinable from the compiler: leave the ID empty
   (expands to nothing, unlike the "" cases above). */
# define ARCHITECTURE_ID
#endif
/* Convert integer to decimal digit literals.
   Expands to eight comma-separated character constants: the decimal digits
   of (n), zero-padded to a fixed width of 8 (i.e. values < 10^8). */
#define DEC(n) \
('0' + (((n) / 10000000)%10)), \
('0' + (((n) / 1000000)%10)), \
('0' + (((n) / 100000)%10)), \
('0' + (((n) / 10000)%10)), \
('0' + (((n) / 1000)%10)), \
('0' + (((n) / 100)%10)), \
('0' + (((n) / 10)%10)), \
('0' + ((n) % 10))
/* Convert integer to hex digit literals.
   Expands to eight comma-separated character constants covering a full
   32-bit value, one nibble each.  Nibbles above 9 produce the characters
   ':' .. '?' (i.e. '0' + nibble), not 'A'..'F'; the consumer of the string
   must decode accordingly. */
#define HEX(n) \
('0' + ((n)>>28 & 0xF)), \
('0' + ((n)>>24 & 0xF)), \
('0' + ((n)>>20 & 0xF)), \
('0' + ((n)>>16 & 0xF)), \
('0' + ((n)>>12 & 0xF)), \
('0' + ((n)>>8 & 0xF)), \
('0' + ((n)>>4 & 0xF)), \
('0' + ((n) & 0xF))
/* Construct a string literal encoding the version number. */
#ifdef COMPILER_VERSION
/* COMPILER_VERSION is already a complete version string: concatenate it
   directly into the "INFO:compiler_version[...]" marker. */
char const* info_version = "INFO" ":" "compiler_version[" COMPILER_VERSION "]";
/* Construct a string literal encoding the version number components. */
#elif defined(COMPILER_VERSION_MAJOR)
/* Otherwise the version is assembled char-by-char.  Each *_MAJOR/MINOR/
   PATCH/TWEAK macro is expected to expand to a comma-separated list of
   character literals (see the DEC/HEX macros above) — TODO confirm against
   the definitions injected by the build. */
char const info_version[] = {
'I', 'N', 'F', 'O', ':',
'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','[',
COMPILER_VERSION_MAJOR,
# ifdef COMPILER_VERSION_MINOR
'.', COMPILER_VERSION_MINOR,
# ifdef COMPILER_VERSION_PATCH
'.', COMPILER_VERSION_PATCH,
# ifdef COMPILER_VERSION_TWEAK
'.', COMPILER_VERSION_TWEAK,
# endif
# endif
# endif
']','\0'};
#endif
/* Construct a string literal encoding the internal version number. */
#ifdef COMPILER_VERSION_INTERNAL
/* Char-list form: COMPILER_VERSION_INTERNAL must expand to a comma-separated
   list of character literals (as produced by DEC/HEX above). */
char const info_version_internal[] = {
'I', 'N', 'F', 'O', ':',
'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','_',
'i','n','t','e','r','n','a','l','[',
COMPILER_VERSION_INTERNAL,']','\0'};
#elif defined(COMPILER_VERSION_INTERNAL_STR)
/* String form: the macro is a ready-made string literal. */
char const* info_version_internal = "INFO" ":" "compiler_version_internal[" COMPILER_VERSION_INTERNAL_STR "]";
#endif
/* Construct a string literal encoding the version number components. */
#ifdef SIMULATE_VERSION_MAJOR
/* Version of the compiler being simulated (e.g. a frontend emulating
   another vendor's compiler).  Built char-by-char exactly like
   info_version; each component macro must expand to character literals. */
char const info_simulate_version[] = {
'I', 'N', 'F', 'O', ':',
's','i','m','u','l','a','t','e','_','v','e','r','s','i','o','n','[',
SIMULATE_VERSION_MAJOR,
# ifdef SIMULATE_VERSION_MINOR
'.', SIMULATE_VERSION_MINOR,
# ifdef SIMULATE_VERSION_PATCH
'.', SIMULATE_VERSION_PATCH,
# ifdef SIMULATE_VERSION_TWEAK
'.', SIMULATE_VERSION_TWEAK,
# endif
# endif
# endif
']','\0'};
#endif
/* Construct the string literal in pieces to prevent the source from
   getting matched. Store it in a pointer rather than an array
   because some compilers will just produce instructions to fill the
   array rather than assigning a pointer to a static array. */
char const* info_platform = "INFO" ":" "platform[" PLATFORM_ID "]";
char const* info_arch = "INFO" ":" "arch[" ARCHITECTURE_ID "]";
/* __STDC_VERSION__ values that delimit the published C standards. */
#define C_STD_99 199901L
#define C_STD_11 201112L
#define C_STD_17 201710L
#define C_STD_23 202311L
#ifdef __STDC_VERSION__
# define C_STD __STDC_VERSION__
#endif
/* Derive the default C standard level as a string.  Note: when C_STD is
   not defined (no __STDC_VERSION__), the #elif comparisons below evaluate
   it as 0 (undefined identifiers are 0 in #if expressions), so the chain
   falls through to "90". */
#if !defined(__STDC__) && !defined(__clang__) && !defined(__RENESAS__)
# if defined(_MSC_VER) || defined(__ibmxl__) || defined(__IBMC__)
# define C_VERSION "90"
# else
# define C_VERSION
# endif
#elif C_STD > C_STD_17
# define C_VERSION "23"
#elif C_STD > C_STD_11
# define C_VERSION "17"
#elif C_STD > C_STD_99
# define C_VERSION "11"
#elif C_STD >= C_STD_99
# define C_VERSION "99"
#else
# define C_VERSION "90"
#endif
const char* info_language_standard_default =
"INFO" ":" "standard_default[" C_VERSION "]";
/* "ON" when the compiler family enables language extensions by default
   and strict-ANSI mode is not in effect; "OFF" otherwise. */
const char* info_language_extensions_default = "INFO" ":" "extensions_default["
#if (defined(__clang__) || defined(__GNUC__) || defined(__xlC__) || \
defined(__TI_COMPILER_VERSION__) || defined(__RENESAS__)) && \
!defined(__STRICT_ANSI__)
"ON"
#else
"OFF"
#endif
"]";
/*--------------------------------------------------------------------------*/
#ifdef ID_VOID_MAIN
/* Some freestanding targets require a void main with an empty body. */
void main() {}
#else
# if defined(__CLASSIC_C__)
/* Pre-ANSI (K&R) definition for compilers without prototypes. */
int main(argc, argv) int argc; char *argv[];
# else
int main(int argc, char* argv[])
# endif
{
  /* Sum one byte from each INFO string, indexed by the runtime value argc,
     so the compiler cannot prove the strings unused and discard them; the
     strings presumably stay in the binary to be scanned for the INFO
     markers rather than executed — NOTE(review): confirm against the
     consumer of this file. */
int require = 0;
require += info_compiler[argc];
require += info_platform[argc];
require += info_arch[argc];
#ifdef COMPILER_VERSION_MAJOR
require += info_version[argc];
#endif
#if defined(COMPILER_VERSION_INTERNAL) || defined(COMPILER_VERSION_INTERNAL_STR)
require += info_version_internal[argc];
#endif
#ifdef SIMULATE_ID
require += info_simulate[argc];
#endif
#ifdef SIMULATE_VERSION_MAJOR
require += info_simulate_version[argc];
#endif
#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
require += info_cray[argc];
#endif
require += info_language_standard_default[argc];
require += info_language_extensions_default[argc];
(void)argv;
return require;
}
#endif

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1 @@
# This file is generated by cmake for dependency checking of the CMakeCache.txt file

View File

@ -1,28 +1,189 @@
/*
* This file comes from the Kyber repo; see the files in kyber/avx2/test or kyber/ref/test for further details.
* pqc-bench harness cycle-count + optional PAPI hardware counter benchmarks.
*
* Usage: <binary> [nspins]
* nspins number of outer loop-spin iterations (default: 1)
*
* Each spin runs all operations with NTESTS inner iterations and prints one
* median/average pair per operation. With WITH_PAPI, additional lines are
* emitted per hardware counter using the same parseable format.
*
* Build flags:
* -DWITH_PAPI link against PAPI and emit hardware counter lines
*/
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include "../kem.h"
#include "../params.h"
#include "../indcpa.h"
#include "../polyvec.h"
#include "../poly.h"
#include "../randombytes.h"
#include <string.h>
#include "kem.h"
#include "params.h"
#include "indcpa.h"
#include "polyvec.h"
#include "poly.h"
#include "randombytes.h"
#include "cpucycles.h"
#include "speed_print.h"
#ifdef WITH_PAPI
#include <papi.h>
#endif
#define NTESTS 1000
uint64_t t[NTESTS];
uint8_t seed[KYBER_SYMBYTES] = {0};
/* ── PAPI instrumentation ───────────────────────────────────────────────── */
#ifdef WITH_PAPI
int main(void)
typedef struct {
int code;
const char *name;
} papi_event_def;
static const papi_event_def DESIRED_EVENTS[] = {
{ PAPI_TOT_INS, "instructions" },
{ PAPI_L1_DCM, "l1_misses" },
{ PAPI_L2_TCM, "l2_misses" },
{ PAPI_L3_TCM, "l3_misses" },
{ PAPI_BR_MSP, "branch_mispreds" },
};
#define MAX_EVENTS ((int)(sizeof(DESIRED_EVENTS) / sizeof(DESIRED_EVENTS[0])))
static int papi_eventset = PAPI_NULL;
static int active_codes[MAX_EVENTS];
static const char *active_names[MAX_EVENTS];
static int n_active = 0;
static int papi_ok = 0; /* set to 1 if init succeeded */
static void papi_init(void) {
int ret;
ret = PAPI_library_init(PAPI_VER_CURRENT);
if (ret != PAPI_VER_CURRENT) {
fprintf(stderr, "PAPI_library_init: %s — hardware counters disabled\n",
PAPI_strerror(ret));
return;
}
if ((ret = PAPI_create_eventset(&papi_eventset)) != PAPI_OK) {
fprintf(stderr, "PAPI_create_eventset: %s — hardware counters disabled\n",
PAPI_strerror(ret));
return;
}
for (int i = 0; i < MAX_EVENTS; i++) {
if (PAPI_query_event(DESIRED_EVENTS[i].code) != PAPI_OK) {
fprintf(stderr, "PAPI: event %s not available on this hardware, skipping\n",
DESIRED_EVENTS[i].name);
continue;
}
ret = PAPI_add_event(papi_eventset, DESIRED_EVENTS[i].code);
if (ret != PAPI_OK) {
fprintf(stderr, "PAPI_add_event(%s): %s — skipping\n",
DESIRED_EVENTS[i].name, PAPI_strerror(ret));
continue;
}
active_codes[n_active] = DESIRED_EVENTS[i].code;
active_names[n_active] = DESIRED_EVENTS[i].name;
n_active++;
}
if (n_active == 0) {
fprintf(stderr, "PAPI: no events could be added — hardware counters disabled\n");
return;
}
if ((ret = PAPI_start(papi_eventset)) != PAPI_OK) {
fprintf(stderr, "PAPI_start: %s — hardware counters disabled\n",
PAPI_strerror(ret));
return;
}
papi_ok = 1;
}
/*
* papi_print print per-call counter values for one (op, counter) pair.
* Both "median" and "average" are set to the same per-call value; the outer
* loop-spin structure gives the aggregation tool a real distribution.
* The IPC line uses a float value multiplied by 1000 for integer storage;
* the analysis tool divides by 1000 to recover IPC.
*/
static void papi_print(const char *op, const char *counter,
long long total, int ntests)
{
unsigned int i;
long long per_call = total / ntests;
printf("%s_%s: \nmedian: %lld per_call\naverage: %lld per_call\n\n",
op, counter, per_call, per_call);
}
/*
* papi_bench read counters around an already-executed NTESTS block.
* Call papi_read_before() immediately before the loop and
* papi_bench_report() immediately after.
*/
static long long _papi_before[MAX_EVENTS];
static long long _papi_after[MAX_EVENTS];
static inline void papi_read_before(void) {
if (papi_ok) PAPI_read(papi_eventset, _papi_before);
}
static void papi_bench_report(const char *op) {
if (!papi_ok) return;
PAPI_read(papi_eventset, _papi_after);
for (int e = 0; e < n_active; e++) {
long long delta = _papi_after[e] - _papi_before[e];
papi_print(op, active_names[e], delta, NTESTS);
}
}
#define PAPI_BEFORE() papi_read_before()
#define PAPI_AFTER(op) papi_bench_report(op)
#else /* !WITH_PAPI */
static inline void papi_init(void) {}
#define PAPI_BEFORE() ((void)0)
#define PAPI_AFTER(op) ((void)0)
#endif /* WITH_PAPI */
/* ── Benchmark helpers ───────────────────────────────────────────────────── */
/*
* BENCH(label, body) time NTESTS executions of body, print results, then
* emit PAPI counter lines if enabled.
*/
#define BENCH(label, body) \
do { \
PAPI_BEFORE(); \
for (unsigned int _i = 0; _i < NTESTS; _i++) { \
t[_i] = cpucycles(); \
body; \
} \
print_results(label ": ", t, NTESTS); \
PAPI_AFTER(label); \
} while (0)
/* ── Main ────────────────────────────────────────────────────────────────── */
static uint64_t t[NTESTS];
static uint8_t seed[KYBER_SYMBYTES] = {0};
int main(int argc, char *argv[])
{
int nspins = 1;
if (argc > 1) {
nspins = atoi(argv[1]);
if (nspins <= 0) {
fprintf(stderr, "usage: %s [nspins]\n", argv[0]);
return 1;
}
}
papi_init();
uint8_t pk[CRYPTO_PUBLICKEYBYTES];
uint8_t sk[CRYPTO_SECRETKEYBYTES];
uint8_t ct[CRYPTO_CIPHERTEXTBYTES];
@ -30,130 +191,74 @@ int main(void)
uint8_t coins32[KYBER_SYMBYTES];
uint8_t coins64[2*KYBER_SYMBYTES];
polyvec matrix[KYBER_K];
poly ap;
poly ap;
randombytes(coins32, KYBER_SYMBYTES);
randombytes(coins64, 2*KYBER_SYMBYTES);
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
gen_matrix(matrix, seed, 0);
}
print_results("gen_a: ", t, NTESTS);
for (int spin = 1; spin <= nspins; spin++) {
printf("Loop spin: %d\n", spin);
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
poly_getnoise_eta1(&ap, seed, 0);
}
print_results("poly_getnoise_eta1: ", t, NTESTS);
BENCH("gen_a",
gen_matrix(matrix, seed, 0));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
poly_getnoise_eta2(&ap, seed, 0);
}
print_results("poly_getnoise_eta2: ", t, NTESTS);
BENCH("poly_getnoise_eta1",
poly_getnoise_eta1(&ap, seed, 0));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
poly_ntt(&ap);
}
print_results("NTT: ", t, NTESTS);
BENCH("poly_getnoise_eta2",
poly_getnoise_eta2(&ap, seed, 0));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
poly_invntt_tomont(&ap);
}
print_results("INVNTT: ", t, NTESTS);
BENCH("NTT",
poly_ntt(&ap));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
polyvec_basemul_acc_montgomery(&ap, &matrix[0], &matrix[1]);
}
print_results("polyvec_basemul_acc_montgomery: ", t, NTESTS);
BENCH("INVNTT",
poly_invntt_tomont(&ap));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
poly_tomsg(ct,&ap);
}
print_results("poly_tomsg: ", t, NTESTS);
BENCH("polyvec_basemul_acc_montgomery",
polyvec_basemul_acc_montgomery(&ap, &matrix[0], &matrix[1]));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
poly_frommsg(&ap,ct);
}
print_results("poly_frommsg: ", t, NTESTS);
BENCH("poly_tomsg",
poly_tomsg(ct, &ap));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
poly_compress(ct,&ap);
}
print_results("poly_compress: ", t, NTESTS);
BENCH("poly_frommsg",
poly_frommsg(&ap, ct));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
poly_decompress(&ap,ct);
}
print_results("poly_decompress: ", t, NTESTS);
BENCH("poly_compress",
poly_compress(ct, &ap));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
polyvec_compress(ct,&matrix[0]);
}
print_results("polyvec_compress: ", t, NTESTS);
BENCH("poly_decompress",
poly_decompress(&ap, ct));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
polyvec_decompress(&matrix[0],ct);
}
print_results("polyvec_decompress: ", t, NTESTS);
BENCH("polyvec_compress",
polyvec_compress(ct, &matrix[0]));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
indcpa_keypair_derand(pk, sk, coins32);
}
print_results("indcpa_keypair: ", t, NTESTS);
BENCH("polyvec_decompress",
polyvec_decompress(&matrix[0], ct));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
indcpa_enc(ct, key, pk, seed);
}
print_results("indcpa_enc: ", t, NTESTS);
BENCH("indcpa_keypair",
indcpa_keypair_derand(pk, sk, coins32));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
indcpa_dec(key, ct, sk);
}
print_results("indcpa_dec: ", t, NTESTS);
BENCH("indcpa_enc",
indcpa_enc(ct, key, pk, seed));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
crypto_kem_keypair_derand(pk, sk, coins64);
}
print_results("kyber_keypair_derand: ", t, NTESTS);
BENCH("indcpa_dec",
indcpa_dec(key, ct, sk));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
crypto_kem_keypair(pk, sk);
}
print_results("kyber_keypair: ", t, NTESTS);
BENCH("kyber_keypair_derand",
crypto_kem_keypair_derand(pk, sk, coins64));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
crypto_kem_enc_derand(ct, key, pk, coins32);
}
print_results("kyber_encaps_derand: ", t, NTESTS);
BENCH("kyber_keypair",
crypto_kem_keypair(pk, sk));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
crypto_kem_enc(ct, key, pk);
}
print_results("kyber_encaps: ", t, NTESTS);
BENCH("kyber_encaps_derand",
crypto_kem_enc_derand(ct, key, pk, coins32));
for(i=0;i<NTESTS;i++) {
t[i] = cpucycles();
crypto_kem_dec(key, ct, sk);
BENCH("kyber_encaps",
crypto_kem_enc(ct, key, pk));
BENCH("kyber_decaps",
crypto_kem_dec(key, ct, sk));
}
print_results("kyber_decaps: ", t, NTESTS);
return 0;
}

Binary file not shown.

View File

@ -0,0 +1,10 @@
op,m512,m768,m1024
INVNTT,1.000,1.000,1.000
basemul,1.000,1.000,1.000
frommsg,1.000,1.000,1.000
NTT,1.000,1.000,1.000
iDec,1.000,1.000,1.000
iEnc,1.000,1.000,1.000
iKeypair,1.000,1.000,1.000
gena,1.000,1.000,1.000
noise,1.000,1.000,0.999
1 op m512 m768 m1024
2 INVNTT 1.000 1.000 1.000
3 basemul 1.000 1.000 1.000
4 frommsg 1.000 1.000 1.000
5 NTT 1.000 1.000 1.000
6 iDec 1.000 1.000 1.000
7 iEnc 1.000 1.000 1.000
8 iKeypair 1.000 1.000 1.000
9 gena 1.000 1.000 1.000
10 noise 1.000 1.000 0.999

View File

@ -0,0 +1,5 @@
op,m512_sp,m512_elo,m512_ehi,m768_sp,m768_elo,m768_ehi,m1024_sp,m1024_elo,m1024_ehi
frommsg,45.642857142857146,0.0,0.0,49.15384615384615,0.0,0.0,55.38461538461539,0.0,0.0
INVNTT,56.26086956521739,0.0,0.0,52.22826086956522,0.0,0.010869565217390686,50.49514563106796,0.009708737864080774,0.0
basemul,52.04054054054054,0.0,0.7128841169937061,47.577586206896555,0.0,0.0,41.63333333333333,0.0,0.0
NTT,35.526315789473685,0.010526315789476826,2.395032525133054,39.39080459770115,0.44762277951932816,0.0,34.58585858585859,0.010101010101010388,0.3631210059781438
1 op m512_sp m512_elo m512_ehi m768_sp m768_elo m768_ehi m1024_sp m1024_elo m1024_ehi
2 frommsg 45.642857142857146 0.0 0.0 49.15384615384615 0.0 0.0 55.38461538461539 0.0 0.0
3 INVNTT 56.26086956521739 0.0 0.0 52.22826086956522 0.0 0.010869565217390686 50.49514563106796 0.009708737864080774 0.0
4 basemul 52.04054054054054 0.0 0.7128841169937061 47.577586206896555 0.0 0.0 41.63333333333333 0.0 0.0
5 NTT 35.526315789473685 0.010526315789476826 2.395032525133054 39.39080459770115 0.44762277951932816 0.0 34.58585858585859 0.010101010101010388 0.3631210059781438

View File

@ -0,0 +1,10 @@
op,refnv_sp,refnv_elo,refnv_ehi,ref_sp,ref_elo,ref_ehi,avx2_sp,avx2_elo,avx2_ehi
INVNTT,3.6937872667820737,0.0,0.0001923446816691765,3.6923668525283597,0.0,0.0008062243947173364,186.44660194174756,0.0,0.00970873786408788
basemul,3.209016393442623,6.209637357201814e-05,0.00012419274714359219,3.4479583666933546,0.00013344008540183694,0.00013344008540183694,143.55,0.005555555555559977,0.005555555555531555
frommsg,3.0156494522691704,0.0,0.0,2.676388888888889,0.0,0.0,148.23076923076923,0.0,0.0
NTT,3.691742580076403,0.0010845307227014267,0.0002938583602705158,3.6691004672897196,0.001071270209427766,0.0010718961341775746,126.8989898989899,0.0,1.3050917336631755
iDec,3.5713012771855714,0.00023570612000023416,0.00015086802895014628,3.690161977834612,0.0005032782539924341,0.00046931032063479705,114.75503711558855,0.0010604453870683983,0.0010604453870541874
iEnc,3.084863236932217,0.0001782560024712332,0.00016342197515761825,3.21233254333646,0.00035364887129318845,0.00028601070699840747,30.157900043693072,0.0029733062283590073,0.001753088869445918
iKeypair,3.049990457461021,0.00022319698359352103,0.00019792531427453852,3.207066542768769,0.0006512941219742885,0.0005064778000369863,26.020352541412997,0.0025143592087069067,0.0010972674500919766
gena,2.6965550354099146,0.000484369799391704,0.00048237643023396615,2.7162479142988416,0.0006808616189104555,0.0007206686696927811,12.97504909321936,0.0031123799730270463,0.0032871286177282855
noise,2.977777777777778,0.0,0.0,3.4190382728164868,0.0,0.0033585837650456085,4.070093457943925,0.0,0.0
1 op refnv_sp refnv_elo refnv_ehi ref_sp ref_elo ref_ehi avx2_sp avx2_elo avx2_ehi
2 INVNTT 3.6937872667820737 0.0 0.0001923446816691765 3.6923668525283597 0.0 0.0008062243947173364 186.44660194174756 0.0 0.00970873786408788
3 basemul 3.209016393442623 6.209637357201814e-05 0.00012419274714359219 3.4479583666933546 0.00013344008540183694 0.00013344008540183694 143.55 0.005555555555559977 0.005555555555531555
4 frommsg 3.0156494522691704 0.0 0.0 2.676388888888889 0.0 0.0 148.23076923076923 0.0 0.0
5 NTT 3.691742580076403 0.0010845307227014267 0.0002938583602705158 3.6691004672897196 0.001071270209427766 0.0010718961341775746 126.8989898989899 0.0 1.3050917336631755
6 iDec 3.5713012771855714 0.00023570612000023416 0.00015086802895014628 3.690161977834612 0.0005032782539924341 0.00046931032063479705 114.75503711558855 0.0010604453870683983 0.0010604453870541874
7 iEnc 3.084863236932217 0.0001782560024712332 0.00016342197515761825 3.21233254333646 0.00035364887129318845 0.00028601070699840747 30.157900043693072 0.0029733062283590073 0.001753088869445918
8 iKeypair 3.049990457461021 0.00022319698359352103 0.00019792531427453852 3.207066542768769 0.0006512941219742885 0.0005064778000369863 26.020352541412997 0.0025143592087069067 0.0010972674500919766
9 gena 2.6965550354099146 0.000484369799391704 0.00048237643023396615 2.7162479142988416 0.0006808616189104555 0.0007206686696927811 12.97504909321936 0.0031123799730270463 0.0032871286177282855
10 noise 2.977777777777778 0.0 0.0 3.4190382728164868 0.0 0.0033585837650456085 4.070093457943925 0.0 0.0

View File

@ -0,0 +1,10 @@
op,refnv_sp,refnv_elo,refnv_ehi,ref_sp,ref_elo,ref_ehi,avx2_sp,avx2_elo,avx2_ehi
INVNTT,4.082526315789473,0.0,0.00021052631579010495,3.7465224111282844,0.0,0.00019319938176209916,210.7826086956522,0.0,0.010869565217376476
basemul,3.2770963704630787,0.0016397780187453748,0.0024627477733942804,3.3996364580628406,0.0,0.0,176.9189189189189,0.0,2.4235468345057427
frommsg,3.0109546165884193,0.0,0.0,3.0109546165884193,0.0,0.0,137.42857142857142,0.0,0.0
NTT,3.6866764275256223,0.002157843972798279,0.0010798700725032084,3.7303703703703706,0.0,0.0011056225164107758,132.52631578947367,0.0,8.934358367829702
iDec,3.742600033957779,0.0006353440528448218,0.00042368257587099833,3.79609644087256,0.0002753054612747441,0.0002753370710646408,133.0543259557344,0.0020120724346099905,0.0020120724346099905
iEnc,3.4432478262438213,0.0002504959891131975,0.00030259771432428195,3.530109117810246,0.00039168308874293345,0.00032646898342836295,35.20992436819775,0.0063094659476519155,0.0011068068622037686
iKeypair,3.1751089014071656,9.92090538622925e-05,0.00021725496542801537,3.351041039836322,0.00032261099326946763,0.0003142150864068327,27.8438,0.005767606478706,0.005769913982796027
gena,2.716878579054644,0.00065187098010977,0.0003882364359895085,2.743237945903567,0.0002940023520188184,0.00046488659667787147,12.781735159817352,0.001369863013698236,0.001369863013698236
noise,3.1366495140080044,0.0017923711508616158,0.0,3.433041301627034,0.0,0.0006257822277846437,4.766290182450043,0.0,0.0041446001586527
1 op refnv_sp refnv_elo refnv_ehi ref_sp ref_elo ref_ehi avx2_sp avx2_elo avx2_ehi
2 INVNTT 4.082526315789473 0.0 0.00021052631579010495 3.7465224111282844 0.0 0.00019319938176209916 210.7826086956522 0.0 0.010869565217376476
3 basemul 3.2770963704630787 0.0016397780187453748 0.0024627477733942804 3.3996364580628406 0.0 0.0 176.9189189189189 0.0 2.4235468345057427
4 frommsg 3.0109546165884193 0.0 0.0 3.0109546165884193 0.0 0.0 137.42857142857142 0.0 0.0
5 NTT 3.6866764275256223 0.002157843972798279 0.0010798700725032084 3.7303703703703706 0.0 0.0011056225164107758 132.52631578947367 0.0 8.934358367829702
6 iDec 3.742600033957779 0.0006353440528448218 0.00042368257587099833 3.79609644087256 0.0002753054612747441 0.0002753370710646408 133.0543259557344 0.0020120724346099905 0.0020120724346099905
7 iEnc 3.4432478262438213 0.0002504959891131975 0.00030259771432428195 3.530109117810246 0.00039168308874293345 0.00032646898342836295 35.20992436819775 0.0063094659476519155 0.0011068068622037686
8 iKeypair 3.1751089014071656 9.92090538622925e-05 0.00021725496542801537 3.351041039836322 0.00032261099326946763 0.0003142150864068327 27.8438 0.005767606478706 0.005769913982796027
9 gena 2.716878579054644 0.00065187098010977 0.0003882364359895085 2.743237945903567 0.0002940023520188184 0.00046488659667787147 12.781735159817352 0.001369863013698236 0.001369863013698236
10 noise 3.1366495140080044 0.0017923711508616158 0.0 3.433041301627034 0.0 0.0006257822277846437 4.766290182450043 0.0 0.0041446001586527

View File

@ -0,0 +1,10 @@
op,refnv_sp,refnv_elo,refnv_ehi,ref_sp,ref_elo,ref_ehi,avx2_sp,avx2_elo,avx2_ehi
INVNTT,3.9386252045826513,0.00020458265139122744,0.00020458265139122744,4.006659729448491,0.0008336786786200534,0.00020811654526564638,209.2608695652174,0.010869565217404897,0.010869565217376476
basemul,3.306184521797905,0.02605040612313525,0.002795691291897384,3.545207465120493,0.0,0.0,168.67241379310346,0.0,0.0
frommsg,2.6708333333333334,0.0,0.0,3.0093896713615025,0.0,0.0,147.92307692307693,0.0,0.0
NTT,3.6989152741131632,0.0010840900568913625,0.0,3.681645754304056,0.0,0.0,145.02298850574712,1.6479885057471222,0.0
iDec,3.6437147040368125,0.00019424892094210833,0.0003467108483481418,3.800139609964661,0.0003315569175033062,0.00016580015750289334,132.98167938931297,0.001526717557254642,0.003053435114509284
iEnc,3.3056977990451344,0.00017231513226034778,0.00016363191105694952,3.48133030817818,0.00022700732330438456,0.00021029337701561346,32.81504567436862,0.004063512322623808,0.0006448146157964629
iKeypair,3.109574915272049,0.00020791977755951763,0.00025167432332651174,3.2525126922733425,0.00022163529575136565,0.000286955967172986,24.668559816590246,0.0031435406706883384,0.0007294706127538575
gena,2.7088029828997557,0.0007052965244342957,0.0005931348088656918,2.69161485393067,0.0005617516864933059,0.0005061000727368814,10.337667648020936,0.002917034774819527,0.0013902518809292275
noise,3.0886524822695036,0.0,0.0008865248226950229,3.4156862745098038,0.0,0.0009803921568627416,4.639147802929427,0.0,0.0013315579227697327
1 op refnv_sp refnv_elo refnv_ehi ref_sp ref_elo ref_ehi avx2_sp avx2_elo avx2_ehi
2 INVNTT 3.9386252045826513 0.00020458265139122744 0.00020458265139122744 4.006659729448491 0.0008336786786200534 0.00020811654526564638 209.2608695652174 0.010869565217404897 0.010869565217376476
3 basemul 3.306184521797905 0.02605040612313525 0.002795691291897384 3.545207465120493 0.0 0.0 168.67241379310346 0.0 0.0
4 frommsg 2.6708333333333334 0.0 0.0 3.0093896713615025 0.0 0.0 147.92307692307693 0.0 0.0
5 NTT 3.6989152741131632 0.0010840900568913625 0.0 3.681645754304056 0.0 0.0 145.02298850574712 1.6479885057471222 0.0
6 iDec 3.6437147040368125 0.00019424892094210833 0.0003467108483481418 3.800139609964661 0.0003315569175033062 0.00016580015750289334 132.98167938931297 0.001526717557254642 0.003053435114509284
7 iEnc 3.3056977990451344 0.00017231513226034778 0.00016363191105694952 3.48133030817818 0.00022700732330438456 0.00021029337701561346 32.81504567436862 0.004063512322623808 0.0006448146157964629
8 iKeypair 3.109574915272049 0.00020791977755951763 0.00025167432332651174 3.2525126922733425 0.00022163529575136565 0.000286955967172986 24.668559816590246 0.0031435406706883384 0.0007294706127538575
9 gena 2.7088029828997557 0.0007052965244342957 0.0005931348088656918 2.69161485393067 0.0005617516864933059 0.0005061000727368814 10.337667648020936 0.002917034774819527 0.0013902518809292275
10 noise 3.0886524822695036 0.0 0.0008865248226950229 3.4156862745098038 0.0 0.0009803921568627416 4.639147802929427 0.0 0.0013315579227697327

View File

@ -0,0 +1,10 @@
op,m512_sp,m512_elo,m512_ehi,m768_sp,m768_elo,m768_ehi,m1024_sp,m1024_elo,m1024_ehi
INVNTT,56.26086956521739,0.0,0.0,52.22826086956522,0.0,0.010869565217390686,50.49514563106796,0.009708737864080774,0.0
basemul,52.04054054054054,0.0,0.7128841169937061,47.577586206896555,0.0,0.0,41.63333333333333,0.0,0.0
frommsg,45.642857142857146,0.0,0.0,49.15384615384615,0.0,0.0,55.38461538461539,0.0,0.0
NTT,35.526315789473685,0.010526315789476826,2.395032525133054,39.39080459770115,0.44762277951932816,0.0,34.58585858585859,0.010101010101010388,0.3631210059781438
iDec,35.05030181086519,0.0020120724346099905,0.002012072434602885,34.993893129770996,0.001526717557254642,0.0030534351145021787,31.097560975609756,0.0037115588547180778,0.004241781548248724
iEnc,9.974174506548607,0.0014707072125688114,0.0011068068622019922,9.426007522837184,0.0013889971548284308,0.0005373455131660876,9.38816253823144,0.001122140301749397,0.001223049292088163
iKeypair,8.309,0.0020613877224544552,0.0018621724344871637,7.584462275948312,0.0012591916511350831,0.0003647353063778169,8.113443296049837,0.0015653318677752992,0.0014866204162533592
gena,4.659360730593607,0.00045662100456667076,0.0004566210045657826,3.8406934903500165,0.0009551420262225996,0.0004906771344455052,4.776828000462054,0.0014497812681515398,0.0015659914501355843
noise,1.3883579496090357,0.0,0.0012072677822687616,1.3581890812250332,0.0,0.0,1.1904205607476634,0.001168224299065379,0.0
1 op m512_sp m512_elo m512_ehi m768_sp m768_elo m768_ehi m1024_sp m1024_elo m1024_ehi
2 INVNTT 56.26086956521739 0.0 0.0 52.22826086956522 0.0 0.010869565217390686 50.49514563106796 0.009708737864080774 0.0
3 basemul 52.04054054054054 0.0 0.7128841169937061 47.577586206896555 0.0 0.0 41.63333333333333 0.0 0.0
4 frommsg 45.642857142857146 0.0 0.0 49.15384615384615 0.0 0.0 55.38461538461539 0.0 0.0
5 NTT 35.526315789473685 0.010526315789476826 2.395032525133054 39.39080459770115 0.44762277951932816 0.0 34.58585858585859 0.010101010101010388 0.3631210059781438
6 iDec 35.05030181086519 0.0020120724346099905 0.002012072434602885 34.993893129770996 0.001526717557254642 0.0030534351145021787 31.097560975609756 0.0037115588547180778 0.004241781548248724
7 iEnc 9.974174506548607 0.0014707072125688114 0.0011068068622019922 9.426007522837184 0.0013889971548284308 0.0005373455131660876 9.38816253823144 0.001122140301749397 0.001223049292088163
8 iKeypair 8.309 0.0020613877224544552 0.0018621724344871637 7.584462275948312 0.0012591916511350831 0.0003647353063778169 8.113443296049837 0.0015653318677752992 0.0014866204162533592
9 gena 4.659360730593607 0.00045662100456667076 0.0004566210045657826 3.8406934903500165 0.0009551420262225996 0.0004906771344455052 4.776828000462054 0.0014497812681515398 0.0015659914501355843
10 noise 1.3883579496090357 0.0 0.0012072677822687616 1.3581890812250332 0.0 0.0 1.1904205607476634 0.001168224299065379 0.0

View File

@ -0,0 +1,4 @@
op,m512_sp,m512_elo,m512_ehi,m768_sp,m768_elo,m768_ehi,m1024_sp,m1024_elo,m1024_ehi
KeyGen,5.351663635391034,0.003951776171514432,0.0036136071694450322,5.515256061277458,0.0010128505412421163,0.0011711084383110304,5.92988426026269,0.009300851394026033,0.008673806818412011
Encaps,5.976169109582211,0.0057508565558670455,0.00541865850737544,6.159967741935484,0.0016760536843927198,0.0019668260454155373,6.374312588912245,0.007289526521085499,0.0062883831365772025
Decaps,7.12829219051115,0.0038254678112616958,0.002336315747572648,7.078920782076425,0.0017374106397927136,0.001435830107824998,6.920672062603092,0.007041626152989089,0.00611276112038972
1 op m512_sp m512_elo m512_ehi m768_sp m768_elo m768_ehi m1024_sp m1024_elo m1024_ehi
2 KeyGen 5.351663635391034 0.003951776171514432 0.0036136071694450322 5.515256061277458 0.0010128505412421163 0.0011711084383110304 5.92988426026269 0.009300851394026033 0.008673806818412011
3 Encaps 5.976169109582211 0.0057508565558670455 0.00541865850737544 6.159967741935484 0.0016760536843927198 0.0019668260454155373 6.374312588912245 0.007289526521085499 0.0062883831365772025
4 Decaps 7.12829219051115 0.0038254678112616958 0.002336315747572648 7.078920782076425 0.0017374106397927136 0.001435830107824998 6.920672062603092 0.007041626152989089 0.00611276112038972

Binary file not shown.

View File

@ -0,0 +1,30 @@
% Figure: cross-param speedup consistency for per-polynomial operations.
\begin{tikzpicture}
\begin{axis}[
pqc bar,
ybar, ymin=0, ymax=70, ytick distance=10,
bar width=6pt,
width=\columnwidth, height=5cm,
symbolic x coords={frommsg,INVNTT,basemul,NTT},
ylabel={Speedup \varref{} $\to$ \varavx{} ($\times$)},
legend entries={\mlkemk{512}, \mlkemk{768}, \mlkemk{1024}},
legend style={at={(0.99,0.99)}, anchor=north east, font=\small},
]
\addplot+[fill=colM512, draw=colM512!70!black, opacity=0.88,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=m512_sp, y error plus=m512_ehi, y error minus=m512_elo,
col sep=comma]{figures/data/cross_param.csv};
\addplot+[fill=colM768, draw=colM768!70!black, opacity=0.88,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=m768_sp, y error plus=m768_ehi, y error minus=m768_elo,
col sep=comma]{figures/data/cross_param.csv};
\addplot+[fill=colM1024, draw=colM1024!70!black, opacity=0.88,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=m1024_sp, y error plus=m1024_ehi, y error minus=m1024_elo,
col sep=comma]{figures/data/cross_param.csv};
\end{axis}
\end{tikzpicture}

View File

@ -0,0 +1,74 @@
% Figure: speedup decomposition — three panels (one per algorithm), log y-axis.
% Data: paper/figures/data/decomp_{mlkem512,768,1024}.csv
\begin{tikzpicture}
\begin{groupplot}[
group style={group size=3 by 1, horizontal sep=1.6cm, ylabels at=edge left},
pqc bar,
ybar, ymode=log, ymin=1, ymax=500,
ytick={1,2,5,10,20,50,100,200},
yticklabels={$1\times$,$2\times$,$5\times$,$10\times$,$20\times$,$50\times$,$100\times$,$200\times$},
yminorticks=true,
width=5.2cm, height=6.5cm,
symbolic x coords={INVNTT,basemul,frommsg,NTT,iDec,iEnc,iKeypair,gena,noise},
xticklabels={INVNTT,basemul,frommsg,NTT,iDec,iEnc,iKeypair,gen\_a,noise},
ylabel={Speedup over \texttt{-O0} ($\times$)},
]
%% ML-KEM-512
\nextgroupplot[title={\mlkemk{512}}, bar width=3.5pt]
\addplot+[fill=colRefnv, draw=colRefnv!70!black, opacity=0.85,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=refnv_sp, y error plus=refnv_ehi, y error minus=refnv_elo,
col sep=comma]{figures/data/decomp_mlkem512.csv};
\addplot+[fill=colRef, draw=colRef!70!black, opacity=0.85,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=ref_sp, y error plus=ref_ehi, y error minus=ref_elo,
col sep=comma]{figures/data/decomp_mlkem512.csv};
\addplot+[fill=colAvx, draw=colAvx!70!black, opacity=0.85,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=avx2_sp, y error plus=avx2_ehi, y error minus=avx2_elo,
col sep=comma]{figures/data/decomp_mlkem512.csv};
%% ML-KEM-768
\nextgroupplot[title={\mlkemk{768}}, ylabel={}, bar width=3.5pt]
\addplot+[fill=colRefnv, draw=colRefnv!70!black, opacity=0.85,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=refnv_sp, y error plus=refnv_ehi, y error minus=refnv_elo,
col sep=comma]{figures/data/decomp_mlkem768.csv};
\addplot+[fill=colRef, draw=colRef!70!black, opacity=0.85,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=ref_sp, y error plus=ref_ehi, y error minus=ref_elo,
col sep=comma]{figures/data/decomp_mlkem768.csv};
\addplot+[fill=colAvx, draw=colAvx!70!black, opacity=0.85,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=avx2_sp, y error plus=avx2_ehi, y error minus=avx2_elo,
col sep=comma]{figures/data/decomp_mlkem768.csv};
%% ML-KEM-1024
\nextgroupplot[title={\mlkemk{1024}}, ylabel={}, bar width=3.5pt,
legend style={at={(1.0,0.99)}, anchor=north east, font=\scriptsize},
legend entries={O3 (no auto-vec), O3 + auto-vec, O3 + hand SIMD}]
\addplot+[fill=colRefnv, draw=colRefnv!70!black, opacity=0.85,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=refnv_sp, y error plus=refnv_ehi, y error minus=refnv_elo,
col sep=comma]{figures/data/decomp_mlkem1024.csv};
\addplot+[fill=colRef, draw=colRef!70!black, opacity=0.85,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=ref_sp, y error plus=ref_ehi, y error minus=ref_elo,
col sep=comma]{figures/data/decomp_mlkem1024.csv};
\addplot+[fill=colAvx, draw=colAvx!70!black, opacity=0.85,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=avx2_sp, y error plus=avx2_ehi, y error minus=avx2_elo,
col sep=comma]{figures/data/decomp_mlkem1024.csv};
\end{groupplot}
\end{tikzpicture}

View File

@ -0,0 +1,34 @@
% Figure: hand-SIMD speedup (ref->avx2), three algorithms overlaid, log y-axis.
\begin{tikzpicture}
\begin{axis}[
pqc bar,
ybar, ymode=log, ymin=1, ymax=100,
ytick={1,2,5,10,20,50},
yticklabels={$1\times$,$2\times$,$5\times$,$10\times$,$20\times$,$50\times$},
yminorticks=true,
bar width=5pt,
width=\textwidth, height=6cm,
symbolic x coords={INVNTT,basemul,frommsg,NTT,iDec,iEnc,iKeypair,gena,noise},
xticklabels={INVNTT,basemul,frommsg,NTT,iDec,iEnc,iKeypair,gen\_a,noise},
ylabel={Speedup \varref{} $\to$ \varavx{} ($\times$)},
legend entries={\mlkemk{512}, \mlkemk{768}, \mlkemk{1024}},
legend style={at={(0.01,0.99)}, anchor=north west, font=\small},
]
\addplot+[fill=colM512, draw=colM512!70!black, opacity=0.88,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=m512_sp, y error plus=m512_ehi, y error minus=m512_elo,
col sep=comma]{figures/data/hand_simd.csv};
\addplot+[fill=colM768, draw=colM768!70!black, opacity=0.88,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=m768_sp, y error plus=m768_ehi, y error minus=m768_elo,
col sep=comma]{figures/data/hand_simd.csv};
\addplot+[fill=colM1024, draw=colM1024!70!black, opacity=0.88,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=m1024_sp, y error plus=m1024_ehi, y error minus=m1024_elo,
col sep=comma]{figures/data/hand_simd.csv};
\end{axis}
\end{tikzpicture}

View File

@ -0,0 +1,30 @@
% Figure: KEM-level end-to-end speedup (supplementary).
\begin{tikzpicture}
\begin{axis}[
pqc bar,
ybar, ymin=0, ymax=9, ytick distance=1,
bar width=8pt,
width=\columnwidth, height=5cm,
symbolic x coords={KeyGen,Encaps,Decaps},
ylabel={Speedup \varref{} $\to$ \varavx{} ($\times$)},
legend entries={\mlkemk{512}, \mlkemk{768}, \mlkemk{1024}},
legend style={at={(0.01,0.99)}, anchor=north west, font=\small},
]
\addplot+[fill=colM512, draw=colM512!70!black, opacity=0.88,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=m512_sp, y error plus=m512_ehi, y error minus=m512_elo,
col sep=comma]{figures/data/kem_level.csv};
\addplot+[fill=colM768, draw=colM768!70!black, opacity=0.88,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=m768_sp, y error plus=m768_ehi, y error minus=m768_elo,
col sep=comma]{figures/data/kem_level.csv};
\addplot+[fill=colM1024, draw=colM1024!70!black, opacity=0.88,
error bars/.cd, y dir=both, y explicit]
table[x=op, y=m1024_sp, y error plus=m1024_ehi, y error minus=m1024_elo,
col sep=comma]{figures/data/kem_level.csv};
\end{axis}
\end{tikzpicture}

47
paper/macros.tex Normal file
View File

@ -0,0 +1,47 @@
% ── Shared macros ─────────────────────────────────────────────────────────────
% Algorithm shorthands
\newcommand{\mlkem}{ML-KEM}
\newcommand{\mlkemk}[1]{ML-KEM-#1}
\newcommand{\mldsa}{ML-DSA}
\newcommand{\slhdsa}{SLH-DSA}
% Variant names (monospace)
\newcommand{\varref}{\texttt{ref}}
\newcommand{\varrefnv}{\texttt{refnv}}
\newcommand{\varrefo}{\texttt{refo0}}
\newcommand{\varavx}{\texttt{avx2}}
% Operation shorthand
\newcommand{\op}[1]{\texttt{#1}}
% Speedup formatting: \speedup{45.6}
\newcommand{\speedup}[1]{$#1\times$}
% Phase 2 / future-work placeholder
\newcommand{\phasetwo}[1]{\todo[color=blue!15,caption={Phase 2: #1}]{Phase~2: #1}}
\newcommand{\phasethree}[1]{\todo[color=green!15,caption={Phase 3: #1}]{Phase~3: #1}}
% pgfplots colors (match matplotlib palette)
\definecolor{colRefnv}{HTML}{4C72B0} % blue
\definecolor{colRef}{HTML}{55A868} % green
\definecolor{colAvx}{HTML}{C44E52} % red
\definecolor{colM512}{HTML}{4C72B0}
\definecolor{colM768}{HTML}{55A868}
\definecolor{colM1024}{HTML}{C44E52}
% Shared pgfplots style.
% NOTE: ybar, ymode=log, and bar width CANNOT be used inside \pgfplotsset styles
% due to a pgfkeys namespace issue; apply them inline in each axis instead.
\pgfplotsset{
pqc bar/.style={
ymajorgrids=true,
yminorgrids=true,
grid style={dashed, gray!30},
xtick=data,
x tick label style={rotate=45, anchor=east, font=\small},
legend style={font=\small, at={(0.99,0.99)}, anchor=north east},
error bars/error bar style={line width=0.5pt},
error bars/error mark options={rotate=90, mark size=1.5pt},
},
}

View File

@ -1,13 +1,22 @@
\documentclass[sigconf, nonacm]{acmart}
% ── Packages ──────────────────────────────────────────────────────────────────
\usepackage{booktabs}
\usepackage{microtype}
\usepackage{subcaption}
\usepackage{todonotes}
\usepackage{pgfplots}
\usepackage{pgfplotstable}
\usepgfplotslibrary{groupplots}
\pgfplotsset{compat=1.18}
% ── Metadata (fill in when ready) ────────────────────────────────────────────
\title{SIMD Optimization in Post-Quantum Cryptography:\\
A Micro-Architecture and Energy Analysis}
\input{macros}
% ── Metadata ──────────────────────────────────────────────────────────────────
% NOTE: Title targets Phase 1 (ML-KEM, x86 AVX2).
% Update when Phase 2/3 material (ML-DSA, ARM, energy) is incorporated.
\title{Where Does SIMD Help Post-Quantum Cryptography?\\
A Micro-Architectural Study of ML-KEM on x86 AVX2}
\author{Levi Neuwirth}
\affiliation{%
@ -18,103 +27,30 @@
}
\email{ln@levineuwirth.org}
% ── Abstract ──────────────────────────────────────────────────────────────────
\begin{abstract}
TODO
\input{sections/abstract}
\end{abstract}
\keywords{post-quantum cryptography, ML-KEM, Kyber, SIMD, AVX2, performance
analysis, energy efficiency, micro-architecture}
analysis, micro-architecture, benchmark reproducibility}
% ─────────────────────────────────────────────────────────────────────────────
\begin{document}
\maketitle
% ── 1. Introduction ──────────────────────────────────────────────────────────
\section{Introduction}
\label{sec:intro}
\input{sections/intro}
\input{sections/background}
\input{sections/methodology}
\input{sections/results}
\input{sections/discussion}
\input{sections/related}
\input{sections/conclusion}
TODO
% ── 2. Background ────────────────────────────────────────────────────────────
\section{Background}
\label{sec:background}
\subsection{ML-KEM / Kyber}
TODO: Module-LWE, ring structure, NTT.
\subsection{SIMD on x86-64}
TODO: AVX2 register model, relevant instructions for polynomial arithmetic.
\subsection{Hardware Performance Counters and RAPL}
TODO: perf, PAPI, Intel RAPL energy domains.
% ── 3. Methodology ───────────────────────────────────────────────────────────
\section{Methodology}
\label{sec:methodology}
\subsection{Implementation Variants}
TODO: ref (AVX2 intrinsics), refnv (scalar, no vectorization), refo0 (unoptimized
baseline).
\subsection{Benchmark Harness}
TODO: cycle counter, iteration count, statistical methodology, OSCAR node spec.
\subsection{Hardware Counter Collection}
TODO: PAPI events selected and why.
\subsection{Energy Measurement}
TODO: RAPL pkg + DRAM domains, joules-per-operation derivation.
% ── 4. Results ───────────────────────────────────────────────────────────────
\section{Results}
\label{sec:results}
\subsection{Cycle Counts}
\begin{table}[h]
\caption{Median cycle counts, ML-KEM-512, 10\,000 iterations.}
\label{tab:cycles512}
\begin{tabular}{lrrr}
\toprule
Operation & ref (AVX2) & refnv (scalar) & speedup \\
\midrule
NTT & TODO & TODO & TODO$\times$ \\
INVNTT & TODO & TODO & TODO$\times$ \\
polyvec\_basemul\_acc & TODO & TODO & TODO$\times$ \\
indcpa\_keypair & TODO & TODO & TODO$\times$ \\
indcpa\_enc & TODO & TODO & TODO$\times$ \\
kyber\_encaps & TODO & TODO & TODO$\times$ \\
kyber\_decaps & TODO & TODO & TODO$\times$ \\
\bottomrule
\end{tabular}
\end{table}
\subsection{Hardware Counter Breakdown}
TODO: IPC, cache miss rates, branch mispredictions.
\subsection{Energy Efficiency}
TODO: joules/operation, EDP comparison.
% ── 5. Discussion ────────────────────────────────────────────────────────────
\section{Discussion}
\label{sec:discussion}
TODO: mechanistic explanation of where the speedup comes from.
% ── 6. Related Work ──────────────────────────────────────────────────────────
\section{Related Work}
\label{sec:related}
TODO
% ── 7. Conclusion ────────────────────────────────────────────────────────────
\section{Conclusion}
\label{sec:conclusion}
TODO
% ── References ───────────────────────────────────────────────────────────────
\bibliographystyle{ACM-Reference-Format}
\bibliography{refs}
\appendix
\input{sections/supplementary}
\end{document}

View File

@ -42,7 +42,7 @@
@misc{kyber-avx2,
author = {Schwabe, Peter and Seiler, Gregor},
title = {{Better Bootstrapping in Fully Homomorphic Encryption}},
title = {{High-Speed {AVX2} Implementation of the {Kyber} Key Encapsulation Mechanism}},
note = {AVX2 implementation in the pqclean project},
url = {https://github.com/pq-crystals/kyber},
}
@ -97,3 +97,45 @@
title = {{pqm4: Post-quantum crypto library for the ARM Cortex-M4}},
url = {https://github.com/mupq/pqm4},
}
@misc{supercop,
author = {Bernstein, Daniel J. and Lange, Tanja},
title = {{SUPERCOP: System for Unified Performance Evaluation Related to
Cryptographic Operations and Primitives}},
url = {https://bench.cr.yp.to/supercop.html},
}
@misc{papi,
author = {{Innovative Computing Laboratory, University of Tennessee}},
title = {{PAPI: Performance Application Programming Interface}},
url = {https://icl.utk.edu/papi/},
}
@inproceedings{gueron2014,
author = {Gueron, Shay and Lindell, Yehuda and Nof, Ariel and Pinkas, Benny},
title = {{Fast Garbling of Circuits Under Standard Assumptions}},
booktitle = {ACM CCS},
year = {2015},
note = {See also: Intel white paper on AES-GCM with AVX2},
}
@misc{bernstein2006,
author = {Bernstein, Daniel J.},
title = {{Curve25519: new Diffie-Hellman speed records}},
year = {2006},
url = {https://cr.yp.to/ecdh.html},
}
@misc{cachetime,
author = {Bernstein, Daniel J. and Schwabe, Peter},
title = {{New AES Software Speed Records}},
year = {2008},
url = {https://cr.yp.to/aes-speed.html},
}
@misc{bettini2024,
author = {{Google Security Blog}},
title = {{Protecting Chrome Traffic with Hybrid Kyber KEM}},
year = {2023},
url = {https://security.googleblog.com/2023/08/protecting-chrome-traffic-with-hybrid.html},
}

View File

@ -0,0 +1,31 @@
Post-quantum cryptography (PQC) standards are being deployed at scale following
NIST's 2024 finalization of \mlkem{} (FIPS~203), \mldsa{} (FIPS~204), and
\slhdsa{} (FIPS~205). Hand-written SIMD implementations of these algorithms
report dramatic performance advantages, yet the mechanistic origins of these
speedups are rarely quantified with statistical rigor.
We present the first systematic empirical decomposition of SIMD speedup across
the operations of \mlkem{} (Kyber) on Intel x86-64 with AVX2. Using a
reproducible benchmark harness across four compilation variants---\varrefo{}
(unoptimized), \varrefnv{} (O3, auto-vectorization disabled), \varref{}
(O3 with auto-vectorization), and \varavx{} (hand-written AVX2 intrinsics)---we
isolate three distinct contributions: compiler optimization, compiler
auto-vectorization, and hand-written SIMD. All measurements are conducted on a
pinned core of an Intel Xeon Platinum 8268 on Brown University's OSCAR HPC
cluster, with statistical significance assessed via Mann-Whitney U tests and
Cliff's~$\delta$ effect-size analysis across $n \ge 2{,}000$ independent
observations per group.
Our key findings are: (1) hand-written AVX2 assembly accounts for
\speedup{35}--\speedup{56} speedup over compiler-optimized C for the dominant
arithmetic operations (NTT, INVNTT, base multiplication), with Cliff's
$\delta = +1.000$ in every comparison---meaning AVX2 is faster in
\emph{every single} observation pair; (2) GCC's auto-vectorizer contributes
negligibly or even slightly negatively for NTT-based operations because the
modular reduction step prevents vectorization; (3) end-to-end KEM speedups of
\speedup{5.4}--\speedup{7.1} result from a weighted combination of large
per-operation gains and smaller gains in SHAKE-heavy operations (gen\_a:
\speedup{3.8}--\speedup{4.7}; noise sampling: \speedup{1.2}--\speedup{1.4}).
The benchmark harness, raw data, and analysis pipeline are released as an open
reproducible artifact.

View File

@ -0,0 +1,88 @@
% ── 2. Background ─────────────────────────────────────────────────────────────
\section{Background}
\label{sec:background}
\subsection{ML-KEM and the Number Theoretic Transform}
\mlkem{}~\cite{fips203} is a key encapsulation mechanism built on the
Module-Learning-With-Errors (Module-LWE) problem. Its security parameter
$k \in \{2, 3, 4\}$ controls the module dimension, yielding the three
instantiations \mlkemk{512}, \mlkemk{768}, and \mlkemk{1024}. The scheme
operates on polynomials in $\mathbb{Z}_q[x]/(x^{256}+1)$ with $q = 3329$.
The computational core is polynomial multiplication, which \mlkem{} evaluates
using the Number Theoretic Transform (NTT)~\cite{ntt-survey}. The NTT is a
modular analog of the Fast Fourier Transform that reduces schoolbook
$O(n^2)$ polynomial multiplication to $O(n \log n)$ pointwise operations.
For $n = 256$ coefficients and $q = 3329$, only a 256th root of unity exists
modulo $q$, so \mlkem{} uses a seven-layer (incomplete) radix-2 Cooley--Tukey
NTT that factors the ring into 128 degree-one components; multiplication in
the NTT domain then reduces to 128 independent products of linear polynomials.
The primitive operations benchmarked in this paper are:
\begin{itemize}
\item \op{NTT} / \op{INVNTT}: forward and inverse NTT over a single
polynomial ($n = 256$).
\item \op{basemul}: pointwise multiplication in the NTT domain (base
multiplication of two NTT-domain polynomials).
\item \op{poly\_frommsg}: encodes a 32-byte message into a polynomial.
\item \op{gen\_a}: generates the public matrix $\mathbf{A}$ by expanding
a seed with SHAKE-128.
\item \op{poly\_getnoise\_eta\{1,2\}}: samples a centered binomial
distribution (CBD) noise polynomial using SHAKE-256 output.
\item \op{indcpa\_\{keypair, enc, dec\}}: full IND-CPA key generation,
encryption, and decryption.
\end{itemize}
\subsection{AVX2 SIMD on x86-64}
Intel's Advanced Vector Extensions 2 (AVX2) extends the YMM register file to
256-bit width, accommodating sixteen 16-bit integers simultaneously. The
\mlkem{} AVX2 implementation~\cite{kyber-avx2} by Schwabe and Seiler uses
hand-written assembly and AVX2 intrinsics rather than compiler-generated
vectorized code.
The key instruction patterns exploited are:
\begin{itemize}
\item \texttt{vpaddw} / \texttt{vpsubw}: packed 16-bit addition/subtraction,
operating on 16 coefficients per instruction.
\item \texttt{vpmullw} / \texttt{vpmulhw}: packed 16-bit low/high multiply,
used to implement 16-wide Montgomery reduction.
\item \texttt{vpunpcklwd} / \texttt{vpunpckhwd}: interleave operations for
the NTT butterfly shuffle pattern.
\end{itemize}
Because \mlkem{} coefficients are 16-bit integers and the NTT butterfly
operates independently on 16 coefficient pairs per round, AVX2 offers a
theoretical $16\times$ instruction-count reduction for arithmetic steps. As
\S\ref{sec:results} shows, observed speedups \emph{exceed} $16\times$ for
\op{INVNTT} and \op{basemul} due to additional instruction-level parallelism
(ILP) in the unrolled hand-written loops.
\subsection{Compilation Variants}
To isolate distinct sources of speedup, we define four compilation variants
(detailed in §\ref{sec:methodology}):
\begin{description}
\item[\varrefo{}] Compiled at \texttt{-O0}: no optimization. Serves as the
unoptimized baseline.
\item[\varrefnv{}] Compiled at \texttt{-O3 -fno-tree-vectorize}: full
compiler optimization but with auto-vectorization disabled. Isolates
the contribution of general compiler optimizations (register
allocation, loop unrolling, constant propagation) from SIMD.
\item[\varref{}] Compiled at \texttt{-O3}: full optimization including GCC's
auto-vectorizer. Represents what production deployments without
hand-tuned SIMD would achieve.
\item[\varavx{}] Hand-written AVX2 assembly: the production-quality
optimized implementation.
\end{description}
\subsection{Hardware Performance Counters and Energy}
\label{sec:bg:papi}
\phasetwo{Expand with PAPI and RAPL background once data is collected.}
Hardware performance counters (accessed via PAPI~\cite{papi} or Linux
\texttt{perf\_event}) allow measuring IPC, cache miss rates, and branch
mispredictions at the instruction level. Intel RAPL~\cite{rapl} provides
package- and DRAM-domain energy readings. These will be incorporated in
Phase~2 to provide a mechanistic hardware-level explanation complementing the
cycle-count analysis presented here.

View File

@ -0,0 +1,46 @@
% ── 7. Conclusion ─────────────────────────────────────────────────────────────
\section{Conclusion}
\label{sec:conclusion}
We presented the first statistically rigorous decomposition of SIMD speedup
in \mlkem{} (Kyber), isolating the contributions of compiler optimization,
auto-vectorization, and hand-written AVX2 assembly. Our main findings are:
\begin{enumerate}
\item \textbf{Hand-written SIMD is necessary, not optional.} GCC's
auto-vectorizer provides negligible benefit ($<10\%$) for NTT-based
arithmetic, and for \op{INVNTT} actually produces slightly slower code
than non-vectorized O3. The full \speedup{35}--\speedup{56} speedup
on arithmetic operations comes entirely from hand-written assembly.
\item \textbf{The distribution of SIMD benefit across operations is
highly non-uniform.} Arithmetic operations (NTT, INVNTT, basemul,
frommsg) achieve \speedup{35}--\speedup{56}; SHAKE-based expansion
(gen\_a) achieves only \speedup{3.8}--\speedup{4.7}; and noise
sampling achieves \speedup{1.2}--\speedup{1.4}. The bottleneck shifts
from compute to memory bandwidth for non-arithmetic operations.
\item \textbf{The statistical signal is overwhelming.} Cliff's $\delta =
+1.000$ for nearly all operations means AVX2 is faster than \varref{}
in every single observation pair across $n \ge 2{,}000$ measurements.
These results are stable across three \mlkem{} parameter sets.
\item \textbf{Context affects even isolated micro-benchmarks.} The NTT
speedup varies by 13\% across parameter sets despite identical
polynomial dimensions, attributed to cache-state effects from
surrounding polyvec operations.
\end{enumerate}
\paragraph{Future work.}
Planned extensions include: hardware performance counter profiles (IPC, cache
miss rates) via PAPI to validate the mechanistic explanations in
§\ref{sec:discussion}; energy measurement via Intel RAPL; extension to
\mldsa{} (Dilithium) and \slhdsa{} (SPHINCS+) with the same harness; and
cross-ISA comparison with ARM NEON/SVE (Graviton3) and RISC-V V. A compiler
version sensitivity study (GCC 11--14, Clang 14--17) will characterize how
stable the auto-vectorization gap is across compiler releases.
\paragraph{Artifact.}
The benchmark harness, SLURM job templates, raw cycle-count data, analysis
pipeline, and this paper are released at
\url{https://github.com/lneuwirth/where-simd-helps} under an open license.

View File

@ -0,0 +1,104 @@
% ── 5. Discussion ─────────────────────────────────────────────────────────────
\section{Discussion}
\label{sec:discussion}
\subsection{Why Arithmetic Operations Benefit Most}
The NTT butterfly loop processes 128 pairs of 16-bit coefficients per forward
transform. In the scalar \varref{} path, each butterfly requires a modular
multiplication (implemented as a Barrett reduction), an addition, and a
subtraction---roughly 10--15 instructions per pair with data-dependent
serialization through the multiply-add chain. The AVX2 path uses
\texttt{vpmullw}/\texttt{vpmulhw} to compute 16 Montgomery multiplications
per instruction, processing an entire butterfly layer with roughly
$16\times$ fewer instructions than the scalar path.
The observed INVNTT speedup of \speedup{56.3} at \mlkemk{512} \emph{exceeds}
the theoretical $16\times$ register-width advantage. We attribute this to
two compounding factors: (1) the unrolled hand-written assembly eliminates
loop overhead and branch prediction pressure; (2) the inverse NTT has a
slightly different access pattern than the forward NTT that benefits from
out-of-order execution with wide issue ports on the Cascade Lake
microarchitecture. \phasetwo{Confirm with IPC and port utilisation counters.}
\subsection{Why the Compiler Cannot Auto-Vectorise NTT}
A striking result is that \varref{} and \varrefnv{} perform nearly identically
for all arithmetic operations ($\leq 10\%$ difference, with \varrefnv{}
occasionally faster). This means GCC's tree-vectorizer produces no net benefit
for the NTT inner loop.
The fundamental obstacle is \emph{modular reduction}: Barrett reduction and
Montgomery reduction require a multiply-high operation (\texttt{vpmulhw}) that
GCC cannot express through the scalar multiply-add chain it generates for the
C reference code. Additionally, the NTT butterfly requires coefficient
interleaving (odd/even index separation) that the auto-vectorizer does not
recognize as a known shuffle pattern. The hand-written assembly encodes these
patterns directly in \texttt{vpunpck*} instructions.
This finding has practical significance: developers porting \mlkem{} to new
platforms cannot rely on the compiler to provide SIMD speedup for the NTT.
Hand-written intrinsics or architecture-specific assembly are necessary.
\subsection{Why SHAKE Operations Benefit Less}
\op{gen\_a} expands a public seed into a $k \times k$ matrix of polynomials
using SHAKE-128. Each Keccak-f[1600] permutation operates on a 200-byte state
that does not fit in AVX2 registers (16 lanes $\times$ 16 bits = 32 bytes). The
AVX2 Keccak implementation achieves \speedup{3.8}--\speedup{4.7} primarily by
batching multiple independent absorb phases and using vectorized XOR across
parallel state words---a different kind of SIMD parallelism than the arithmetic
path. The bottleneck shifts to memory bandwidth as the permutation state is
repeatedly loaded from and stored to L1 cache.
\subsection{Why Noise Sampling Barely Benefits}
CBD noise sampling reads adjacent bits from a byte stream and computes
Hamming weights. The scalar path already uses bitwise operations with no
data-dependent branches (constant-time design). The AVX2 path can batch the
popcount computation but remains bottlenecked by the sequential bitstream
access pattern. The small \speedup{1.2}--\speedup{1.4} speedup reflects
this fundamental memory access bottleneck rather than compute limitation.
\subsection{NTT Cache-State Variation Across Parameter Sets}
The 13\% variation in NTT speedup across parameter sets
(\S\ref{sec:results:crossparams}) despite identical polynomial dimensions
suggests that execution context matters even for nominally isolated
micro-benchmarks. Higher-$k$ polyvec operations that precede each NTT call
have larger memory footprints ($k$ more polynomials in the accumulation
buffer), potentially evicting portions of the instruction cache or L1 data
cache that the scalar NTT path relies on. The AVX2 path is less affected
because it maintains more coefficient state in vector registers between
operations. \phasetwo{Verify with L1/L2 miss counters split by scalar vs AVX2.}
\subsection{Implications for Deployment}
The end-to-end KEM speedups of \speedup{5.4}--\speedup{7.1} (Appendix,
Figure~\ref{fig:kemlevel}) represent the practical deployment benefit.
Deployments that cannot use hand-written SIMD (e.g., some constrained
environments, or languages without inline assembly support) should expect
performance within a factor of $5$--$7$ of the AVX2 reference.
Auto-vectorization provides essentially no shortcut: the gap between
compiler-optimized C and hand-written SIMD is the full $5$--$7\times$, not
a fraction of it.
\subsection{Limitations}
\paragraph{No hardware counter data (Phase~1).} The mechanistic explanations
in this section are derived analytically from instruction-set structure and
publicly known microarchitecture details. Phase~2 will validate these with
PAPI counter measurements. \phasetwo{PAPI counters: IPC, cache miss rates.}
\paragraph{Single microarchitecture.} All results are from Intel Cascade Lake
(Xeon Platinum 8268). Speedup ratios may differ on other AVX2 hosts (e.g.,
Intel Skylake, AMD Zen 3/4) due to differences in execution port configuration,
vector throughput, and out-of-order window size.
\phasethree{Repeat on AMD Zen, ARM Graviton3, RISC-V.}
\paragraph{Frequency scaling.} OSCAR nodes may operate in a power-capped mode
that reduces Turbo Boost frequency under sustained SIMD load. RDTSC counts
wall-clock ticks at the invariant TSC frequency, which may differ from the
actual core frequency during SIMD execution.
\phasetwo{Characterize frequency during benchmarks; consider RAPL-normalized
cycle counts.}

51
paper/sections/intro.tex Normal file
View File

@ -0,0 +1,51 @@
% ── 1. Introduction ───────────────────────────────────────────────────────────
\section{Introduction}
\label{sec:intro}
The 2024 NIST post-quantum cryptography standards~\cite{fips203,fips204,fips205}
mark a turning point in deployed cryptography. \mlkem{} (Module-Lattice Key
Encapsulation Mechanism, FIPS~203) is already being integrated into TLS~1.3 by
major browser vendors~\cite{bettini2024} and is planned for inclusion in OpenSSH.
At deployment scale, performance matters: a server handling thousands of TLS
handshakes per second experiences a non-trivial computational overhead from
replacing elliptic-curve key exchange with a lattice-based KEM.
Reference implementations of \mlkem{} ship with hand-optimized AVX2 assembly
for the dominant operations~\cite{kyber-avx2}. Benchmarks routinely report
that the AVX2 path is ``$5$--$7\times$ faster'' than the portable C reference.
However, such top-level numbers conflate several distinct phenomena:
compiler optimization, compiler auto-vectorization, and hand-written SIMD. They
also say nothing about \emph{which} operations drive the speedup or \emph{why}
the assembly is faster than what a compiler can produce automatically.
\subsection*{Contributions}
This paper makes the following contributions:
\begin{enumerate}
\item \textbf{Three-way speedup decomposition.} We isolate compiler
optimization, auto-vectorization, and hand-written SIMD as separate
factors using four compilation variants (§\ref{sec:methodology}).
\item \textbf{Statistically rigorous benchmarking.} All comparisons are
backed by Mann-Whitney U tests and Cliff's~$\delta$ effect-size
analysis over $n \ge 2{,}000$ independent observations, with
bootstrapped 95\% confidence intervals on speedup ratios
(\S\ref{sec:results}).
\item \textbf{Mechanistic analysis without hardware counters.} We explain
the quantitative speedup pattern analytically from the structure of
the NTT butterfly, Montgomery multiplication, and the SHAKE-128
permutation (§\ref{sec:discussion}).
\item \textbf{Open reproducible artifact.} The full pipeline from raw
SLURM outputs to publication figures is released publicly.
\end{enumerate}
\subsection*{Scope and roadmap}
This report covers Phase~1 of a broader study: \mlkem{} on Intel x86-64 with
AVX2. Planned extensions include hardware performance counter profiles (PAPI),
energy measurement (Intel RAPL), extension to \mldsa{} (Dilithium), and
cross-ISA comparison with ARM NEON/SVE and RISC-V V. Those results will be
incorporated in subsequent revisions.

View File

@ -0,0 +1,105 @@
% ── 3. Methodology ────────────────────────────────────────────────────────────
\section{Methodology}
\label{sec:methodology}
\subsection{Implementation Source}
We use the \mlkem{} reference implementation from the \texttt{pq-crystals/kyber}
repository~\cite{kyber-avx2}, which provides both a portable C reference
(\varref{} / \varrefnv{}) and hand-written AVX2 assembly (\varavx{}). The
implementation targets the CRYSTALS-Kyber specification, functionally identical
to FIPS~203.
\subsection{Compilation Variants}
\label{sec:meth:variants}
We compile the same C source under four variant configurations using GCC 13.3.0:
\begin{description}
\item[\varrefo{}] \texttt{-O0}: unoptimized. Every operation is loaded/stored
through memory; no inlining, no register allocation. Establishes a
reproducible performance floor.
\item[\varrefnv{}] \texttt{-O3 -fno-tree-vectorize}: aggressive scalar
optimization but with the tree-vectorizer disabled. Isolates the
auto-vectorization contribution from general O3 optimizations.
\item[\varref{}] \texttt{-O3}: full optimization with GCC auto-vectorization
enabled. Represents realistic scalar-C performance.
\item[\varavx{}] \texttt{-O3} with hand-written AVX2 assembly linked in:
the production optimized path.
\end{description}
All four variants are built with position-independent code and identical linker
flags. The AVX2 assembly sources use the same \texttt{KYBER\_NAMESPACE} macro
as the C sources to prevent symbol collisions.
\subsection{Benchmark Harness}
Each binary runs a \emph{spin loop}: $N = 1{,}000$ outer iterations (spins),
each performing 20~repetitions of the target operation followed by a median
and mean cycle count report via \texttt{RDTSC}. Using the median of 20
repetitions per spin suppresses within-spin outliers; collecting 1{,}000 spins
produces a distribution of 1{,}000 median observations per binary invocation.
Two independent job submissions per (algorithm, variant) pair yield
$n \ge 2{,}000$ independent observations per group (3{,}000 for \varref{} and
\varavx{}, which had a third clean run). All runs used \texttt{taskset} to pin
to a single logical core, preventing OS scheduling interference.
\subsection{Hardware Platform}
All benchmarks were conducted on Brown University's OSCAR HPC cluster, node
\texttt{node2334}, pinned via SLURM's \texttt{{-}{-}nodelist} directive to
ensure that all variants are measured on identical hardware. The node specifications are:
\begin{center}
\small
\begin{tabular}{ll}
\toprule
CPU model & Intel Xeon Platinum 8268 (Cascade Lake) \\
Clock speed & 2.90\,GHz base \\
ISA extensions & SSE4.2, AVX, AVX2, AVX-512F \\
L1D cache & 32\,KB (per core) \\
L2 cache & 1\,MB (per core) \\
L3 cache & 35.75\,MB (shared) \\
OS & Linux (kernel 3.10) \\
Compiler & GCC 13.3.0 \\
\bottomrule
\end{tabular}
\end{center}
\noindent\textbf{Reproducibility note:} The \texttt{perf\_event\_paranoid}
setting on OSCAR nodes is 2, which prevents unprivileged access to hardware
performance counters. Hardware counter data (IPC, cache miss rates) will be
collected in Phase~2 after requesting elevated permissions from the cluster
administrators. \phasetwo{Hardware counter collection via PAPI.}
\subsection{Statistical Methodology}
\label{sec:meth:stats}
Cycle count distributions are right-skewed with occasional outliers from
OS interrupts and cache-cold starts (Figure~\ref{fig:distributions}). We
therefore use nonparametric statistics throughout:
\begin{itemize}
\item \textbf{Speedup}: ratio of group medians, $\hat{s} =
\text{median}(X_\text{baseline}) / \text{median}(X_\text{variant})$.
\item \textbf{Confidence interval}: 95\% bootstrap CI on $\hat{s}$,
computed by resampling both groups independently $B = 5{,}000$ times
with replacement.
\item \textbf{Mann-Whitney U test}: one-sided test for the hypothesis that
the variant distribution is stochastically smaller than the baseline
($H_1: P(X_\text{variant} < X_\text{baseline}) > 0.5$).
\item \textbf{Cliff's $\delta$}: effect size defined as $\delta =
[P(X_\text{variant} < X_\text{baseline}) -
P(X_\text{variant} > X_\text{baseline})]$, derived from the
Mann-Whitney U statistic. $\delta = +1$ indicates that
\emph{every} variant observation is faster than \emph{every}
baseline observation.
\end{itemize}
\subsection{Energy Measurement}
\label{sec:meth:energy}
\phasetwo{Intel RAPL (pkg + DRAM domains), EDP computation, per-operation joules.}
Energy measurements via Intel RAPL will be incorporated in Phase~2. The harness
already includes conditional RAPL support (\texttt{-DWITH\_RAPL=ON}) pending
appropriate system permissions.

View File

@ -0,0 +1,41 @@
% ── 6. Related Work ───────────────────────────────────────────────────────────
\section{Related Work}
\label{sec:related}
\paragraph{ML-KEM / Kyber implementations.}
The AVX2 implementation studied here was developed by Schwabe and
Seiler~\cite{kyber-avx2} and forms the optimized path in both the
\texttt{pq-crystals/kyber} reference repository and
PQClean~\cite{pqclean}. Bos et al.~\cite{kyber2018} describe the original
Kyber submission; FIPS~203~\cite{fips203} is the standardized form.
The ARM NEON and Cortex-M4 implementations are available in
pqm4~\cite{pqm4}; cross-ISA comparison is planned for Phase~3.
\paragraph{PQC benchmarking.}
eBACS/SUPERCOP provides a cross-platform benchmark suite~\cite{supercop} that
reports median cycle counts for many cryptographic primitives, including Kyber.
Our contribution complements this with a statistically rigorous decomposition
using nonparametric effect-size analysis and bootstrapped CIs. Kannwischer et
al.~\cite{pqm4} present systematic benchmarks on ARM Cortex-M4 (pqm4), which
focuses on constrained-device performance rather than SIMD analysis.
\paragraph{SIMD in cryptography.}
Gueron and Krasnov demonstrated AVX2 speedups for AES-GCM~\cite{gueron2014};
similar techniques underpin the Kyber AVX2 implementation. Bernstein's
vectorized polynomial arithmetic for Curve25519~\cite{bernstein2006} established
the template of hand-written vector intrinsics for cryptographic field
arithmetic.
\paragraph{NTT optimization.}
Longa and Naehrig~\cite{ntt-survey} survey NTT algorithms for ideal
lattice-based cryptography and analyze instruction counts for vectorized
implementations. Our measurements provide the first empirical cycle-count
decomposition isolating the compiler's contribution vs.\ hand-written SIMD for
the ML-KEM NTT specifically.
\paragraph{Hardware counter profiling.}
Bernstein and Schwabe~\cite{cachetime} discuss the relationship between cache
behavior and cryptographic timing. PAPI~\cite{papi} provides a portable
interface to hardware performance counters used in related profiling work.
Phase~2 of this study will add PAPI counter collection to provide the
mechanistic hardware-level explanation of the speedups observed here.

181
paper/sections/results.tex Normal file
View File

@ -0,0 +1,181 @@
% ── 4. Results ────────────────────────────────────────────────────────────────
\section{Results}
\label{sec:results}
\subsection{Cycle Count Distributions}
\label{sec:results:distributions}
Figure~\ref{fig:distributions} shows the cycle count distributions for three
representative operations in \mlkemk{512}, comparing \varref{} and \varavx{}.
All distributions are right-skewed with a long tail from OS interrupts and
cache-cold executions. The median (dashed lines) is robust to these outliers,
justifying the nonparametric approach of §\ref{sec:meth:stats}.
The separation between \varref{} and \varavx{} is qualitatively different
across operation types: for \op{INVNTT} the distributions do not overlap at
all (disjoint spikes separated by nearly two orders of magnitude on the log scale);
for \op{gen\_a} there is partial overlap; for noise sampling the distributions
are nearly coincident.
\begin{figure}[t]
\centering
\includegraphics[width=\columnwidth]{figures/distributions.pdf}
\caption{Cycle count distributions for three representative \mlkemk{512}
operations. Log $x$-axis. Dashed lines mark medians. Right-skew and
outlier structure motivate nonparametric statistics.}
\label{fig:distributions}
\end{figure}
\subsection{Speedup Decomposition}
\label{sec:results:decomp}
Figure~\ref{fig:decomp} shows the cumulative speedup at each optimization stage
for all three \mlkem{} parameter sets. Each group of bars represents one
operation; the three bars within a group show the total speedup achieved after
applying (i)~O3 without auto-vec (\varrefnv{}), (ii)~O3 with auto-vec
(\varref{}), and (iii)~hand-written AVX2 (\varavx{})---all normalized to the
unoptimized \varrefo{} baseline. The log scale makes the three orders of
magnitude of variation legible.
Several structural features are immediately apparent:
\begin{itemize}
\item The \varrefnv{} and \varref{} bars are nearly indistinguishable for
arithmetic operations (NTT, INVNTT, basemul, frommsg), confirming that
GCC's auto-vectorizer contributes negligibly to these operations.
\item The \varavx{} bars are 1--2 orders of magnitude taller than the
\varref{} bars for arithmetic operations, indicating that hand-written
SIMD dominates the speedup.
\item For SHAKE-heavy operations (gen\_a, noise), all three bars are much
closer together, reflecting the memory-bandwidth bottleneck that limits
SIMD benefit.
\end{itemize}
\begin{figure*}[t]
\centering
\input{figures/fig_decomp}
\caption{Cumulative speedup at each optimization stage, normalized to
\varrefo{} (1×). Three bars per operation:
\textcolor{colRefnv}{$\blacksquare$}~O3 no auto-vec,
\textcolor{colRef}{$\blacksquare$}~O3 + auto-vec,
\textcolor{colAvx}{$\blacksquare$}~O3 + hand SIMD (AVX2).
Log $y$-axis; 95\% bootstrap CI shown on \varavx{} bars.
Sorted by \varavx{} speedup.}
\label{fig:decomp}
\end{figure*}
\subsection{Hand-Written SIMD Speedup}
\label{sec:results:simd}
Figure~\ref{fig:handsimd} isolates the hand-written SIMD speedup (\varref{}
$\to$ \varavx{}) across all three \mlkem{} parameter sets. Table~\ref{tab:simd}
summarizes the numerical values.
Key observations:
\begin{itemize}
\item \textbf{Arithmetic operations} achieve the largest speedups:
\speedup{56.3} for \op{INVNTT} at \mlkemk{512}, \speedup{52.0} for
\op{basemul}, and \speedup{45.6} for \op{frommsg}. The 95\% bootstrap
CIs on these ratios are extremely tight (often $[\hat{s}, \hat{s}]$ to
two decimal places), reflecting near-perfect measurement stability.
\item \textbf{gen\_a} achieves \speedup{3.8}--\speedup{4.7}: substantially
smaller than arithmetic operations because SHAKE-128 generation is
memory-bandwidth limited.
\item \textbf{Noise sampling} achieves only \speedup{1.2}--\speedup{1.4},
the smallest SIMD benefit. The centered binomial distribution (CBD)
sampler is bit-manipulation-heavy with sequential bitstream reads that
do not parallelize well.
\item Speedups are broadly consistent across parameter sets for per-polynomial
operations, as expected (§\ref{sec:results:crossparams}).
\end{itemize}
\begin{figure*}[t]
\centering
\input{figures/fig_hand_simd}
\caption{Hand-written SIMD speedup (\varref{} $\to$ \varavx{}) per operation,
across all three \mlkem{} parameter sets. Log $y$-axis.
95\% bootstrap CI error bars (often sub-pixel).
Sorted by \mlkemk{512} speedup.}
\label{fig:handsimd}
\end{figure*}
\begin{table}[t]
\caption{Hand-written SIMD speedup (\varref{} $\to$ \varavx{}), median ratio
with 95\% bootstrap CI. All Cliff's $\delta = +1.000$, $p < 10^{-300}$.}
\label{tab:simd}
\small
\begin{tabular}{lccc}
\toprule
Operation & \mlkemk{512} & \mlkemk{768} & \mlkemk{1024} \\
\midrule
\op{INVNTT} & $56.3\times$ & $52.2\times$ & $50.5\times$ \\
\op{basemul} & $52.0\times$ & $47.6\times$ & $41.6\times$ \\
\op{frommsg} & $45.6\times$ & $49.2\times$ & $55.4\times$ \\
\op{NTT} & $35.5\times$ & $39.4\times$ & $34.6\times$ \\
\op{iDec} & $35.1\times$ & $35.0\times$ & $31.1\times$ \\
\op{iEnc} & $10.0\times$ & $9.4\times$ & $9.4\times$ \\
\op{iKeypair}& $8.3\times$ & $7.6\times$ & $8.1\times$ \\
\op{gen\_a} & $4.7\times$ & $3.8\times$ & $4.8\times$ \\
\op{noise} & $1.4\times$ & $1.4\times$ & $1.2\times$ \\
\bottomrule
\end{tabular}
\end{table}
\subsection{Statistical Significance}
\label{sec:results:stats}
All \varref{} vs.\ \varavx{} comparisons pass the Mann-Whitney U test at
$p < 10^{-300}$. Cliff's $\delta = +1.000$ for all operations except
\op{NTT} at \mlkemk{512} and \mlkemk{1024} ($\delta = +0.999$), meaning AVX2
achieves a strictly smaller cycle count than \varref{} in effectively every
observation pair.
Figure~\ref{fig:cliffs} shows the heatmap of Cliff's $\delta$ values across
all operations and parameter sets.
\begin{figure}[t]
\centering
\includegraphics[width=\columnwidth]{figures/cliffs_delta_heatmap.pdf}
\caption{Cliff's $\delta$ (\varref{} vs.\ \varavx{}) for all operations and
parameter sets. $\delta = +1$: AVX2 is faster in every observation
pair. Nearly all cells are at $+1.000$.}
\label{fig:cliffs}
\end{figure}
\subsection{Cross-Parameter Consistency}
\label{sec:results:crossparams}
Figure~\ref{fig:crossparams} shows the \varavx{} speedup for the four
per-polynomial operations across \mlkemk{512}, \mlkemk{768}, and
\mlkemk{1024}. Since all three instantiations operate on 256-coefficient
polynomials, speedups for \op{frommsg} and \op{INVNTT} should be
parameter-independent. This holds approximately: \op{frommsg} varies by only
$\pm 10\%$, \op{INVNTT} by $\pm 6\%$.
\op{NTT} shows a more pronounced variation ($35.5\times$ at \mlkemk{512},
$39.4\times$ at \mlkemk{768}, $34.6\times$ at \mlkemk{1024}) that is
statistically real (non-overlapping 95\% CIs). We attribute this to
\emph{cache state effects}: the surrounding polyvec loops that precede each
NTT call have a footprint that varies with $k$, leaving different cache
residency patterns that affect NTT latency in the scalar \varref{} path.
The AVX2 path is less sensitive because its smaller register footprint keeps
more state in vector registers.
\begin{figure}[t]
\centering
\input{figures/fig_cross_param}
\caption{Per-polynomial operation speedup (\varref{} $\to$ \varavx{}) across
security parameters. Polynomial dimension is 256 for all; variation
reflects cache-state differences in the calling context.}
\label{fig:crossparams}
\end{figure}
\subsection{Hardware Counter Breakdown}
\label{sec:results:papi}
\phasetwo{IPC, L1/L2/L3 cache miss rates, branch mispredictions via PAPI.
This section will contain bar charts of per-counter values comparing ref and
avx2 for each operation, explaining the mechanistic origins of the speedup.}
\subsection{Energy Efficiency}
\label{sec:results:energy}
\phasetwo{Intel RAPL pkg + DRAM energy readings per operation.
EDP (energy-delay product) comparison. Energy per KEM operation.}

View File

@ -0,0 +1,31 @@
% ── Supplementary: KEM-level end-to-end speedup ───────────────────────────────
\section{End-to-End KEM Speedup}
\label{sec:supp:kem}
Figure~\ref{fig:kemlevel} shows the hand-written SIMD speedup for the
top-level KEM operations: key generation (\op{kyber\_keypair}), encapsulation
(\op{kyber\_encaps}), and decapsulation (\op{kyber\_decaps}). These composite
operations aggregate the speedups of their constituent primitives, weighted by
relative cycle counts.
Decapsulation achieves the highest speedup (\speedup{6.9}--\speedup{7.1})
because it involves the largest share of arithmetic operations (two additional
NTT and INVNTT calls for re-encryption verification). Key generation achieves
the lowest (\speedup{5.3}--\speedup{5.9}) because it involves one fewer
polynomial multiplication step than encapsulation.
\begin{figure}[h]
\centering
\input{figures/fig_kem_level}
\caption{End-to-end KEM speedup (\varref{} $\to$ \varavx{}) for
\op{kyber\_keypair}, \op{kyber\_encaps}, and \op{kyber\_decaps}.
Intel Xeon Platinum 8268; 95\% bootstrap CI.}
\label{fig:kemlevel}
\end{figure}
\section{Full Operation Set}
\label{sec:supp:fullops}
\todo[inline]{Full operation speedup table for all 20 benchmarked operations,
including \op{poly\_compress}, \op{poly\_decompress}, \op{polyvec\_compress},
\op{poly\_tomsg}, and the \texttt{*\_derand} KEM variants.}

49
slurm/build.sh Executable file
View File

@ -0,0 +1,49 @@
#!/bin/bash
# Build all benchmark binaries on the HPC login node.
#
# Usage: bash slurm/build.sh [--papi] [--rapl]
#
# Run this once after rsyncing, before submitting jobs.
# Binaries are written to harness/build-hpc/.
set -euo pipefail

REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
BUILD_DIR="${REPO_ROOT}/harness/build-hpc"

# Optional instrumentation toggles; both stay OFF unless requested on the
# command line.
WITH_PAPI=OFF
WITH_RAPL=OFF
while [[ $# -gt 0 ]]; do
    case "$1" in
        --papi) WITH_PAPI=ON ;;
        --rapl) WITH_RAPL=ON ;;
        *) echo "unknown flag: $1" >&2; exit 1 ;;
    esac
    shift
done

# Banner: record the effective configuration for the build log.
echo "=== pqc-bench build ==="
echo "REPO_ROOT : $REPO_ROOT"
echo "BUILD_DIR : $BUILD_DIR"
echo "WITH_PAPI : $WITH_PAPI"
echo "WITH_RAPL : $WITH_RAPL"
echo "CC : ${CC:-default}"
echo "DATE : $(date -Iseconds)"

# The kyber submodule must be checked out before CMake can find its sources;
# probe for one known file rather than trusting the directory to exist.
KYBER_PROBE="${REPO_ROOT}/algorithms/kyber/ref/kem.c"
if [[ ! -f "$KYBER_PROBE" ]]; then
    echo "Populating git submodules..."
    git -C "$REPO_ROOT" submodule update --init --recursive
fi

# Configure and build every variant in a single Release tree.
cmake \
    -B "$BUILD_DIR" \
    -S "${REPO_ROOT}/harness" \
    -DCMAKE_BUILD_TYPE=Release \
    -DWITH_PAPI="${WITH_PAPI}" \
    -DWITH_RAPL="${WITH_RAPL}"
cmake --build "$BUILD_DIR" --parallel

echo ""
echo "Built binaries:"
# `|| echo` keeps set -e from aborting when no binaries matched the glob.
ls -lh "${BUILD_DIR}"/bench_mlkem* 2>/dev/null || echo "(none found)"

85
slurm/submit.sh Executable file
View File

@ -0,0 +1,85 @@
#!/bin/bash
# Instantiate and submit SLURM benchmark jobs.
#
# Usage: bash slurm/submit.sh [--papi] [--nspins N] [--params LIST] [--variants LIST] [--node NODE]
#
# Examples:
#   bash slurm/submit.sh
#   bash slurm/submit.sh --papi --nspins 500
#   bash slurm/submit.sh --variants "ref avx2" --params "512 1024"
#   bash slurm/submit.sh --node node2334   # pin all jobs to a specific node
set -euo pipefail
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
TEMPLATE="${REPO_ROOT}/slurm/templates/bench_mlkem.sh.tmpl"
# ── Defaults ─────────────────────────────────────────────────────────────────
NSPINS=1000
WITH_PAPI=OFF
PARAMS="512 768 1024"
VARIANTS="ref avx2 refnv refo0"
BENCH_NODE=""
# Fail with a clear message when a value-taking flag is the last argument;
# otherwise `shift` + `$1` would die with an opaque set -u "unbound variable".
require_value() {
    if [[ $# -lt 2 ]]; then
        echo "flag $1 requires a value" >&2
        exit 1
    fi
}
while [[ $# -gt 0 ]]; do
    case "$1" in
        --papi) WITH_PAPI=ON ;;
        --nspins)   require_value "$@"; shift; NSPINS="$1" ;;
        --params)   require_value "$@"; shift; PARAMS="$1" ;;
        --variants) require_value "$@"; shift; VARIANTS="$1" ;;
        --node)     require_value "$@"; shift; BENCH_NODE="$1" ;;
        *) echo "unknown flag: $1" >&2; exit 1 ;;
    esac
    shift
done
# Build directory created by build.sh.
BUILD_DIR="${REPO_ROOT}/harness/build-hpc"
if [[ ! -d "$BUILD_DIR" ]]; then
    echo "ERROR: $BUILD_DIR not found — run slurm/build.sh first" >&2
    exit 1
fi
echo "=== pqc-bench submit ==="
echo "NSPINS : $NSPINS"
echo "WITH_PAPI: $WITH_PAPI"
echo "PARAMS : $PARAMS"
echo "VARIANTS : $VARIANTS"
echo "NODE : ${BENCH_NODE:-any}"
echo ""
JOBS_SUBMITTED=0
for PARAM in $PARAMS; do
    for VARIANT in $VARIANTS; do
        BINARY="${BUILD_DIR}/bench_mlkem${PARAM}_${VARIANT}"
        if [[ ! -x "$BINARY" ]]; then
            echo "SKIP bench_mlkem${PARAM}_${VARIANT} — binary not found"
            continue
        fi
        # Output goes into data/raw/kyber/mlkem{PARAM}/{VARIANT}/ so the aggregation
        # tool infers algorithm and variant from the directory structure.
        OUTPUT_DIR="${REPO_ROOT}/data/raw/kyber/mlkem${PARAM}/${VARIANT}"
        mkdir -p "$OUTPUT_DIR"
        # Instantiate template. The instantiated script is deliberately left in
        # /tmp after submission so failed jobs can be inspected and re-run.
        JOB_SCRIPT="$(mktemp /tmp/bench_mlkem${PARAM}_${VARIANT}.XXXXXX.sh)"
        export PARAM VARIANT NSPINS BUILD_DIR OUTPUT_DIR WITH_PAPI BENCH_NODE
        envsubst '${PARAM} ${VARIANT} ${NSPINS} ${BUILD_DIR} ${OUTPUT_DIR} ${WITH_PAPI} ${BENCH_NODE}' \
            < "$TEMPLATE" > "$JOB_SCRIPT"
        chmod +x "$JOB_SCRIPT"
        # Collect sbatch arguments in an array: unlike an unquoted string,
        # array expansion "${SBATCH_ARGS[@]}" never re-splits or globs values.
        SBATCH_ARGS=(--parsable)
        if [[ -n "$BENCH_NODE" ]]; then
            SBATCH_ARGS+=(--nodelist="$BENCH_NODE")
        fi
        JOB_ID=$(sbatch "${SBATCH_ARGS[@]}" "$JOB_SCRIPT")
        echo "SUBMIT bench_mlkem${PARAM}_${VARIANT} job=${JOB_ID} out=${OUTPUT_DIR}/${JOB_ID}.out"
        JOBS_SUBMITTED=$((JOBS_SUBMITTED + 1))
    done
done
echo ""
echo "Submitted $JOBS_SUBMITTED jobs."

View File

@ -1,38 +1,48 @@
#!/bin/bash
# Template SLURM job for ML-KEM benchmarking.
# Variables filled in by slurm/submit.sh:
# PARAM — 512 | 768 | 1024
# VARIANT — ref | refnv | avx2 | ...
# NTESTS — iterations per operation (default 10000)
# BINARY — path to compiled benchmark binary
# SLURM job template for ML-KEM benchmarking.
# Instantiated by slurm/submit.sh — do not submit directly.
#
# Template variables (filled by envsubst in submit.sh):
# PARAM — 512 | 768 | 1024
# VARIANT — ref | avx2 | refnv | refo0
# NSPINS — outer loop iterations (default 1000)
# BUILD_DIR — path to directory containing the benchmark binaries
# OUTPUT_DIR — directory where this job's .out file is written
#SBATCH -J bench_mlkem${PARAM}_${VARIANT}
#SBATCH -p batch
#SBATCH -n 1
#SBATCH --mem=2G
#SBATCH -t 02:00:00
#SBATCH --constraint=intel
#SBATCH -o %j_mlkem${PARAM}_${VARIANT}.out
#SBATCH -c 1
#SBATCH --mem=256M
#SBATCH -t 00:45:00
#SBATCH -o ${OUTPUT_DIR}/%j.out
# Pin to a single core, disable frequency scaling for deterministic measurements.
# Requires appropriate OSCAR allocation; skip if unavailable.
export GOMP_CPU_AFFINITY="0"
# ── Environment ──────────────────────────────────────────────────────────────
# Pin to a single logical core for deterministic measurements.
taskset -cp 0 $$ 2>/dev/null || true
NTESTS=${NTESTS:-10000}
BINARY=${BINARY:-./bench_mlkem${PARAM}_${VARIANT}}
# Disable CPU frequency scaling if we have permission; ignore otherwise.
cpupower frequency-set -g performance 2>/dev/null || true
# ── Metadata (parsed by analysis/pkg/parse) ──────────────────────────────────
# These ## lines are picked up by the parser alongside the OSCAR prolog lines.
echo "## BENCH_VARIANT : ${VARIANT}"
echo "## BENCH_PARAM : ${PARAM}"
echo "## BENCH_NSPINS : ${NSPINS}"
echo "## BENCH_NODE_REQ : ${BENCH_NODE}"
echo "## BENCH_BINARY : ${BUILD_DIR}/bench_mlkem${PARAM}_${VARIANT}"
echo "## BENCH_DATE : $(date -Iseconds)"
echo "## CPU_MODEL : $(grep 'model name' /proc/cpuinfo | head -1 | cut -d: -f2 | xargs)"
echo "## PERF_PARANOID : $(cat /proc/sys/kernel/perf_event_paranoid 2>/dev/null || echo unknown)"
echo "## PAPI_BUILD : ${WITH_PAPI:-OFF}"
BINARY="${BUILD_DIR}/bench_mlkem${PARAM}_${VARIANT}"
NSPINS="${NSPINS:-1000}"
if [[ ! -x "$BINARY" ]]; then
echo "ERROR: binary not found or not executable: $BINARY" >&2
exit 1
fi
echo "=== bench_mlkem${PARAM}_${VARIANT} ==="
echo "SLURM_JOB_ID: $SLURM_JOB_ID"
echo "SLURM_NODELIST: $SLURM_NODELIST"
echo "NTESTS: $NTESTS"
echo "DATE: $(date -Iseconds)"
echo "UNAME: $(uname -a)"
echo "CPU: $(grep 'model name' /proc/cpuinfo | head -1 | cut -d: -f2 | xargs)"
echo "---"
"$BINARY" "$NTESTS"
# ── Run ───────────────────────────────────────────────────────────────────────
"$BINARY" "$NSPINS"