Phase 1
This commit is contained in:
parent
7750ae3d8c
commit
00ced380f9
|
|
@ -0,0 +1,3 @@
|
|||
[submodule "algorithms/kyber"]
|
||||
path = algorithms/kyber
|
||||
url = https://github.com/pq-crystals/kyber
|
||||
|
|
@ -0,0 +1 @@
|
|||
Subproject commit 4768bd37c02f9c40a46cb49d4d1f4d5e612bb882
|
||||
|
|
@ -0,0 +1,286 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Statistical analysis of pqc-bench results.
|
||||
|
||||
Parses .out files via the Go aggregator, then computes a three-way
|
||||
decomposition of where speedup originates:
|
||||
|
||||
refo0 → refnv compiler optimisation (O3, no vectorisation)
|
||||
refnv → ref compiler auto-vectorisation
|
||||
ref → avx2 hand-written SIMD
|
||||
|
||||
Usage:
|
||||
# Run aggregator inline:
|
||||
python3 analysis/analyze.py --data data/raw/kyber
|
||||
|
||||
# Or pre-generate the raw JSON once, then reuse it:
|
||||
go run ./analysis/cmd/aggregate --raw --out /tmp/bench.json data/raw/kyber
|
||||
python3 analysis/analyze.py --json /tmp/bench.json
|
||||
|
||||
# Write JSON output for figure generation:
|
||||
python3 analysis/analyze.py --data data/raw/kyber --out analysis/results.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from scipy import stats as scipy_stats
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Data loading
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parent.parent
|
||||
|
||||
|
||||
def load_json(path: str) -> list[dict]:
    """Read previously aggregated benchmark records from a JSON file."""
    with open(path) as handle:
        return json.load(handle)
|
||||
|
||||
|
||||
def run_aggregator(data_dir: str) -> list[dict]:
    """Run the Go aggregator and return parsed records.

    Invokes ``go run ./cmd/aggregate --raw <data_dir>`` with the working
    directory set to ``<repo>/analysis`` so the relative package path
    resolves. On a non-zero exit status, echoes the aggregator's stderr
    and terminates this process with exit code 1.

    Returns the aggregator's stdout parsed as a JSON list of records.
    """
    cmd = ["go", "run", "./cmd/aggregate", "--raw", data_dir]
    # cwd must be analysis/ for the ./cmd/aggregate module path to resolve.
    result = subprocess.run(cmd, capture_output=True, text=True, cwd=REPO_ROOT / "analysis")
    if result.returncode != 0:
        print(f"aggregator failed:\n{result.stderr}", file=sys.stderr)
        sys.exit(1)
    return json.loads(result.stdout)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Statistics
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def cliffs_delta_from_u(u: float, m: int, n: int) -> float:
    """Cliff's delta derived from the Mann-Whitney U statistic.

    U = number of pairs (faster_i, baseline_j) where faster_i < baseline_j.
    delta = (2U - m*n) / (m*n) ∈ [-1, +1]
    Positive → faster dominates baseline.
    """
    pairs = m * n
    return (2 * u - pairs) / pairs
|
||||
|
||||
|
||||
def bootstrap_speedup_ci(
|
||||
baseline: np.ndarray,
|
||||
faster: np.ndarray,
|
||||
n_boot: int = 5_000,
|
||||
ci: float = 0.95,
|
||||
rng: np.random.Generator | None = None,
|
||||
) -> tuple[float, float]:
|
||||
"""95% bootstrap CI for speedup = median(baseline) / median(faster).
|
||||
|
||||
Resamples both arrays independently using vectorised indexing; returns (lo, hi).
|
||||
"""
|
||||
if rng is None:
|
||||
rng = np.random.default_rng(42)
|
||||
m, n = len(baseline), len(faster)
|
||||
# Draw all indices at once: shape (n_boot, m) and (n_boot, n)
|
||||
bi = rng.integers(0, m, size=(n_boot, m))
|
||||
fi = rng.integers(0, n, size=(n_boot, n))
|
||||
b_samples = baseline[bi] # (n_boot, m)
|
||||
f_samples = faster[fi] # (n_boot, n)
|
||||
# Median along axis=1 for each bootstrap replicate
|
||||
ratios = np.median(b_samples, axis=1) / np.median(f_samples, axis=1)
|
||||
alpha = (1 - ci) / 2
|
||||
return float(np.percentile(ratios, alpha * 100)), float(np.percentile(ratios, (1 - alpha) * 100))
|
||||
|
||||
|
||||
def compare(baseline: np.ndarray, faster: np.ndarray, rng: np.random.Generator) -> dict:
    """Full pairwise comparison: speedup + CI + Mann-Whitney + Cliff's delta.

    Returns a dict with:
      speedup        median(baseline) / median(faster)
      ci95           95% bootstrap CI for the speedup, as [lo, hi]
      mannwhitney_p  one-sided p-value for H1: faster < baseline (cycles)
      cliffs_delta   effect size; positive when `faster` dominates baseline
      n_baseline / n_faster  sample sizes
    """
    speedup = float(np.median(baseline)) / float(np.median(faster))
    ci_lo, ci_hi = bootstrap_speedup_ci(baseline, faster, rng=rng)

    # One-sided Mann-Whitney: is faster < baseline in cycle counts?
    m, n = len(faster), len(baseline)
    u_stat, p_val = scipy_stats.mannwhitneyu(faster, baseline, alternative="less")

    # BUG FIX: scipy returns U1 for its *first* argument, i.e. the number of
    # pairs where faster_i > baseline_j (ties counted as half). But
    # cliffs_delta_from_u documents U as #{faster_i < baseline_j}, and the
    # display convention is "positive δ → faster dominates". Passing the raw
    # U1 flipped the sign (a clearly-faster variant reported δ ≈ -1).
    # Convert via U2 = m*n - U1, which is exact even in the presence of ties.
    delta = cliffs_delta_from_u(m * n - float(u_stat), m, n)

    return {
        "speedup": speedup,
        "ci95": [ci_lo, ci_hi],
        "mannwhitney_p": float(p_val),
        "cliffs_delta": delta,
        "n_baseline": n,
        "n_faster": m,
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Analysis
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
VARIANTS = ("refo0", "refnv", "ref", "avx2")
|
||||
|
||||
# Canonical operation order for display
|
||||
OP_ORDER = [
|
||||
"NTT", "INVNTT", "basemul", "frommsg",
|
||||
"gen_a", "poly_getnoise_eta1", "poly_getnoise_eta2",
|
||||
"keygen", "enc", "dec",
|
||||
]
|
||||
|
||||
|
||||
def analyze(records: list[dict]) -> list[dict]:
    """Compute the per-(algorithm, operation) speedup decomposition.

    Records that carry raw observations are indexed by (algorithm, variant,
    operation); for every algorithm/operation pair with at least two
    variants present, the adjacent decomposition steps and the two totals
    are computed via compare().
    """
    # (algorithm, variant, operation) → observations as float64
    samples: dict[tuple[str, str, str], np.ndarray] = {}
    for rec in records:
        if rec.get("raw"):
            triple = (rec["algorithm"], rec["variant"], rec["operation"])
            samples[triple] = np.array(rec["raw"], dtype=np.float64)

    # Unique (algorithm, operation) pairs, ordered by canonical op rank.
    alg_op_pairs = sorted(
        {(alg, op) for alg, _variant, op in samples},
        key=lambda pair: (pair[0], _op_rank(pair[1])),
    )

    rng = np.random.default_rng(42)
    results: list[dict] = []
    for alg, op in alg_op_pairs:
        arrays = {}
        for variant in VARIANTS:
            if (alg, variant, op) in samples:
                arrays[variant] = samples[(alg, variant, op)]

        # Nothing to compare with fewer than two variants present.
        if len(arrays) < 2:
            continue

        row: dict = {
            "algorithm": alg,
            "operation": op,
            "medians": {v: float(np.median(a)) for v, a in arrays.items()},
            "n_obs": {v: len(a) for v, a in arrays.items()},
            "comparisons": {},
        }
        comps = row["comparisons"]

        # Adjacent steps of the three-way decomposition, then the totals.
        # Order matters: compare() consumes the shared rng sequentially.
        for base, fast, label in (
            ("refo0", "refnv", "refo0_to_refnv"),
            ("refnv", "ref", "refnv_to_ref"),
            ("ref", "avx2", "ref_to_avx2"),
            ("refo0", "ref", "refo0_to_ref"),
            ("refo0", "avx2", "refo0_to_avx2"),
        ):
            if base in arrays and fast in arrays:
                comps[label] = compare(arrays[base], arrays[fast], rng)

        results.append(row)

    return results
|
||||
|
||||
|
||||
def _op_rank(op: str) -> int:
    """Sort key: position of *op* in OP_ORDER; unknown operations sort last."""
    if op in OP_ORDER:
        return OP_ORDER.index(op)
    return len(OP_ORDER)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Display
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _fmt_speedup(comp: dict | None) -> str:
|
||||
if comp is None:
|
||||
return " — "
|
||||
r = comp["speedup"]
|
||||
lo, hi = comp["ci95"]
|
||||
return f"{r:5.2f}x [{lo:.2f},{hi:.2f}]"
|
||||
|
||||
|
||||
def _fmt_delta(comp: dict | None) -> str:
|
||||
if comp is None:
|
||||
return " —"
|
||||
return f"{comp['cliffs_delta']:+.3f}"
|
||||
|
||||
|
||||
def _fmt_p(comp: dict | None) -> str:
|
||||
if comp is None:
|
||||
return " —"
|
||||
p = comp["mannwhitney_p"]
|
||||
if p < 1e-300:
|
||||
return " <1e-300"
|
||||
if p < 1e-10:
|
||||
return f" {p:.1e}"
|
||||
return f" {p:.4f}"
|
||||
|
||||
|
||||
def print_table(results: list[dict]) -> None:
    """Pretty-print one speedup table per algorithm to stdout.

    Rows are sorted by descending ref→avx2 speedup (rows without that
    comparison sink to the bottom via the 0 default). The Cliff δ and
    p-value columns always refer to the ref→avx2 comparison, as stated in
    the trailing legend.
    """
    algs = sorted({r["algorithm"] for r in results})

    for alg in algs:
        rows = [r for r in results if r["algorithm"] == alg]
        # Negated key → descending by hand-SIMD speedup.
        rows.sort(key=lambda r: -r["comparisons"].get("ref_to_avx2", {}).get("speedup", 0))

        print(f"\n{'─'*110}")
        print(f" {alg.upper()}")
        print(f"{'─'*110}")
        # Column headers; each speedup column is 18 chars wide to fit
        # "RR.RRx [lo.xx,hi.xx]" from _fmt_speedup.
        print(
            f" {'Operation':<24}"
            f" {'O3 (no-vec)':>18}"  # refo0→refnv
            f" {'Auto-vec':>18}"  # refnv→ref
            f" {'Hand SIMD':>18}"  # ref→avx2
            f" {'Total':>18}"  # refo0→avx2
            f" {'Cliff δ':>7}"
            f" {'p-value':>9}"
        )
        print(f" {'':─<24} {'':─<18} {'':─<18} {'':─<18} {'':─<18} {'':─<7} {'':─<9}")

        for r in rows:
            c = r["comparisons"]
            # Missing comparisons render as dashes via the _fmt_* helpers.
            print(
                f" {r['operation']:<24}"
                f" {_fmt_speedup(c.get('refo0_to_refnv')):>18}"
                f" {_fmt_speedup(c.get('refnv_to_ref')):>18}"
                f" {_fmt_speedup(c.get('ref_to_avx2')):>18}"
                f" {_fmt_speedup(c.get('refo0_to_avx2')):>18}"
                f" {_fmt_delta(c.get('ref_to_avx2')):>7}"
                f" {_fmt_p(c.get('ref_to_avx2')):>9}"
            )

    print(f"\n{'─'*110}")
    print(" Speedup = median(baseline) / median(variant); CI: 95% bootstrap (5000 iterations)")
    print(" Cliff δ and p-value are for ref → avx2 comparison (H1: avx2 cycles < ref cycles)")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main() -> None:
    """CLI entry point: load records, analyse, print the table, and
    optionally dump the analysis JSON for figure generation."""
    cli = argparse.ArgumentParser(description="Statistical analysis of pqc-bench results")
    source = cli.add_mutually_exclusive_group(required=True)
    source.add_argument("--data", metavar="DIR", help="data directory (runs Go aggregator)")
    source.add_argument("--json", metavar="FILE", help="pre-generated aggregate JSON with --raw")
    cli.add_argument("--out", metavar="FILE", help="write analysis JSON to this file")
    args = cli.parse_args()

    # Either reuse a pre-generated JSON dump or shell out to the aggregator.
    if args.json:
        records = load_json(args.json)
        print(f"Loaded {len(records)} groups from {args.json}.", file=sys.stderr)
    else:
        print("Running aggregator...", file=sys.stderr)
        records = run_aggregator(args.data)
        print(f"Loaded {len(records)} groups.", file=sys.stderr)

    results = analyze(records)
    print_table(results)

    if args.out:
        with open(args.out, "w") as sink:
            json.dump(results, sink, indent=2)
        print(f"\nWrote analysis JSON to {args.out}", file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -0,0 +1,215 @@
|
|||
// aggregate parses pqc-bench .out files and emits summary statistics as JSON.
|
||||
//
|
||||
// Usage:
|
||||
//
|
||||
// aggregate [--raw] [--out results.json] <data-dir>
|
||||
//
|
||||
// It walks <data-dir> for all *.out files, grouping results by the parent
|
||||
// directory name (algorithm) and the variant inferred from the SLURM header.
|
||||
// Output is a JSON array of result objects, one per (algorithm, variant,
|
||||
// operation) triple.
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"git.levineuwirth.org/neuwirth/where-simd-helps/analysis/pkg/parse"
|
||||
"git.levineuwirth.org/neuwirth/where-simd-helps/analysis/pkg/stats"
|
||||
)
|
||||
|
||||
// Result is one output record: all statistics for a single
// (algorithm, variant, operation) group.
//
// Field order is preserved deliberately: encoding/json emits fields in
// declaration order, and downstream consumers read this schema.
type Result struct {
	Algorithm     string     `json:"algorithm"`
	Variant       string     `json:"variant"`
	Operation     string     `json:"operation"`
	Unit          string     `json:"unit"`           // always "cycles" (set in main)
	NObservations int        `json:"n_observations"` // total observations across all runs
	NRuns         int        `json:"n_runs"`         // number of contributing .out files
	Median        float64    `json:"median"`
	Mean          float64    `json:"mean"`
	Std           float64    `json:"std"`
	MAD           float64    `json:"mad"` // presumably median absolute deviation — defined by stats.Compute
	P5            float64    `json:"p5"`
	P25           float64    `json:"p25"`
	P75           float64    `json:"p75"`
	P95           float64    `json:"p95"`
	P99           float64    `json:"p99"`
	CI95          [2]float64 `json:"ci95"`
	Node          string     `json:"node"`          // node of the first run that created the group
	Sources       []string   `json:"sources"`       // paths of contributing .out files
	Raw           []int64    `json:"raw,omitempty"` // per-observation values, only with --raw
}
|
||||
|
||||
// groupKey uniquely identifies a (algorithm, variant, operation) combination.
// It is comparable and used directly as a map key when accumulating
// observations.
type groupKey struct {
	algorithm, variant, operation string
}
|
||||
|
||||
// main walks the data directory for *.out files, accumulates per-group
// observations via the parse package, computes summary statistics per
// (algorithm, variant, operation), and emits the sorted result set as JSON
// to stdout or --out.
func main() {
	rawFlag := flag.Bool("raw", false, "include per-observation cycle counts in output")
	outFlag := flag.String("out", "", "write JSON output to this file instead of stdout")
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: aggregate [--raw] [--out FILE] <data-dir>\n")
		flag.PrintDefaults()
	}
	flag.Parse()

	// Exactly one positional argument: the data directory.
	if flag.NArg() != 1 {
		flag.Usage()
		os.Exit(1)
	}
	dataDir := flag.Arg(0)

	// Collect all .out files (recursively).
	var outFiles []string
	err := filepath.WalkDir(dataDir, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if !d.IsDir() && strings.HasSuffix(path, ".out") {
			outFiles = append(outFiles, path)
		}
		return nil
	})
	if err != nil {
		fmt.Fprintf(os.Stderr, "error walking %s: %v\n", dataDir, err)
		os.Exit(1)
	}
	if len(outFiles) == 0 {
		fmt.Fprintf(os.Stderr, "no .out files found under %s\n", dataDir)
		os.Exit(1)
	}

	// Parse every file and accumulate observations per group.
	type accumulator struct {
		values  []int64  // one value per observation (spin medians)
		sources []string // .out files recorded for this group
		node    string   // node of the run that created the accumulator
	}
	groups := make(map[groupKey]*accumulator)

	for _, path := range outFiles {
		run, err := parse.ParseFile(path)
		if err != nil {
			// Best-effort: a malformed file is skipped with a warning,
			// not a fatal error.
			fmt.Fprintf(os.Stderr, "warning: skipping %s: %v\n", path, err)
			continue
		}

		algorithm := inferAlgorithm(run.Meta, path)
		variant := parse.InferVariant(run.Meta)

		// Each spin contributes one median per operation.
		for _, spin := range run.Spins {
			for op, m := range spin {
				key := groupKey{algorithm, variant, op}
				acc := groups[key]
				if acc == nil {
					acc = &accumulator{node: run.Meta.Node}
					groups[key] = acc
				}
				acc.values = append(acc.values, m.Median)
			}
		}

		// Record sources per group (any key with this algorithm+variant).
		// NOTE(review): this tags the file onto *every* existing group that
		// shares algorithm+variant, including groups this file contributed
		// no observations to — and a group created by a later file will not
		// list earlier files. Confirm this is the intended definition of
		// "sources"/NRuns.
		for key, acc := range groups {
			if key.algorithm == algorithm && key.variant == variant {
				if !slices.Contains(acc.sources, path) {
					acc.sources = append(acc.sources, path)
				}
			}
		}
	}

	// Build results: sort each group's values once, then compute all
	// summary statistics from the sorted copy.
	results := make([]Result, 0, len(groups))
	for key, acc := range groups {
		sorted := make([]int64, len(acc.values))
		copy(sorted, acc.values)
		stats.SortInt64(sorted)

		s := stats.Compute(sorted)

		r := Result{
			Algorithm:     key.algorithm,
			Variant:       key.variant,
			Operation:     key.operation,
			Unit:          "cycles",
			NObservations: s.N,
			NRuns:         len(acc.sources),
			Median:        s.Median,
			Mean:          s.Mean,
			Std:           s.Std,
			MAD:           s.MAD,
			P5:            s.P5,
			P25:           s.P25,
			P75:           s.P75,
			P95:           s.P95,
			P99:           s.P99,
			CI95:          s.CI95,
			Node:          acc.node,
			Sources:       acc.sources,
		}
		if *rawFlag {
			// Unsorted, in accumulation order.
			r.Raw = acc.values
		}
		results = append(results, r)
	}

	// Sort for stable output: algorithm → variant → operation.
	slices.SortFunc(results, func(a, b Result) int {
		if a.Algorithm != b.Algorithm {
			return strings.Compare(a.Algorithm, b.Algorithm)
		}
		if a.Variant != b.Variant {
			return strings.Compare(a.Variant, b.Variant)
		}
		return strings.Compare(a.Operation, b.Operation)
	})

	out, err := json.MarshalIndent(results, "", " ")
	if err != nil {
		fmt.Fprintf(os.Stderr, "error marshalling JSON: %v\n", err)
		os.Exit(1)
	}

	if *outFlag != "" {
		if err := os.WriteFile(*outFlag, out, 0o644); err != nil {
			fmt.Fprintf(os.Stderr, "error writing %s: %v\n", *outFlag, err)
			os.Exit(1)
		}
		fmt.Fprintf(os.Stderr, "wrote %d results to %s\n", len(results), *outFlag)
	} else {
		fmt.Println(string(out))
	}
}
|
||||
|
||||
// inferAlgorithm returns the algorithm name (e.g. "mlkem512") for a run.
//
// Priority:
//  1. BENCH_PARAM metadata → "mlkem{PARAM}" (new-style runs via submit.sh)
//  2. Walk the file path upward for a directory segment *starting with*
//     "mlkem" (note: a plain prefix check — any "mlkem*" segment matches,
//     not just strict "mlkem\d+"; handles both flat old-style layout and
//     the new nested layout)
//  3. The immediate parent directory name as a last resort.
func inferAlgorithm(meta parse.Meta, filePath string) string {
	if meta.BenchParam != "" {
		return "mlkem" + meta.BenchParam
	}
	// Walk path components upward looking for a "mlkem"-prefixed segment.
	dir := filepath.Dir(filePath)
	for dir != "." && dir != "/" {
		base := filepath.Base(dir)
		if strings.HasPrefix(base, "mlkem") {
			return base
		}
		dir = filepath.Dir(dir)
	}
	// Fallback: the parent directory name, whatever it is.
	return filepath.Base(filepath.Dir(filePath))
}
|
||||
|
|
@ -0,0 +1,242 @@
|
|||
// analyze-simd computes speedup ratios from aggregated pqc-bench results.
|
||||
//
|
||||
// Usage:
|
||||
//
|
||||
// analyze-simd [--baseline ref] [--in results.json] [--out speedups.json]
|
||||
//
|
||||
// It reads the JSON produced by 'aggregate', computes per-operation speedups
|
||||
// relative to the baseline variant, and emits both a human-readable table
|
||||
// and a structured JSON file suitable for downstream plotting.
|
||||
package main
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"encoding/json"
|
||||
"flag"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"slices"
|
||||
"strings"
|
||||
"text/tabwriter"
|
||||
)
|
||||
|
||||
// Record mirrors the aggregate output schema (only the fields this tool
// needs; extra JSON fields are ignored by encoding/json on unmarshal).
type Record struct {
	Algorithm string     `json:"algorithm"`
	Variant   string     `json:"variant"`
	Operation string     `json:"operation"`
	Median    float64    `json:"median"`
	CI95      [2]float64 `json:"ci95"` // [lo, hi] from the aggregator
	NRuns     int        `json:"n_runs"`
}
|
||||
|
||||
// Speedup is one variant-vs-baseline comparison for a single
// (algorithm, operation) pair.
type Speedup struct {
	Variant   string     `json:"variant"`
	Median    float64    `json:"median"`       // median of the comparison variant
	Speedup   float64    `json:"speedup"`      // baseline median / variant median
	SpeedupCI [2]float64 `json:"speedup_ci95"` // conservative ratio-of-bounds CI
}
|
||||
|
||||
// Result is one output row: all comparisons for one (algorithm, operation)
// pair against the chosen baseline variant.
type Result struct {
	Algorithm       string     `json:"algorithm"`
	Operation       string     `json:"operation"`
	BaselineVariant string     `json:"baseline_variant"`
	BaselineMedian  float64    `json:"baseline_median"`
	BaselineCI95    [2]float64 `json:"baseline_ci95"`
	Comparisons     []Speedup  `json:"comparisons"` // one entry per non-baseline variant present
}
|
||||
|
||||
// main reads the aggregate JSON, computes per-operation speedups of every
// variant relative to --baseline, prints a table to stderr, and emits the
// structured results as JSON to stdout or --out.
func main() {
	baseline := flag.String("baseline", "ref", "variant to use as the speedup denominator")
	inFile := flag.String("in", "results/kyber.json", "input JSON from aggregate")
	outFile := flag.String("out", "", "write speedup JSON to this file (default: stdout)")
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: analyze-simd [--baseline VARIANT] [--in FILE] [--out FILE]\n")
		flag.PrintDefaults()
	}
	flag.Parse()

	raw, err := os.ReadFile(*inFile)
	if err != nil {
		fmt.Fprintf(os.Stderr, "error reading %s: %v\n", *inFile, err)
		os.Exit(1)
	}
	var records []Record
	if err := json.Unmarshal(raw, &records); err != nil {
		fmt.Fprintf(os.Stderr, "error parsing JSON: %v\n", err)
		os.Exit(1)
	}

	// Index by (algorithm, variant, operation) for O(1) lookups below.
	type key struct{ algorithm, variant, operation string }
	idx := make(map[key]Record, len(records))
	for _, r := range records {
		idx[key{r.Algorithm, r.Variant, r.Operation}] = r
	}

	// Collect sorted unique values for stable output.
	algorithms := unique(records, func(r Record) string { return r.Algorithm })
	operations := unique(records, func(r Record) string { return r.Operation })
	variants := unique(records, func(r Record) string { return r.Variant })
	// Remove baseline from comparison variants.
	variants = slices.DeleteFunc(variants, func(v string) bool { return v == *baseline })

	// Build results. Pairs whose baseline is missing or has a zero median
	// are skipped entirely; individual variant comparisons are likewise
	// skipped when absent or zero.
	var results []Result
	for _, alg := range algorithms {
		for _, op := range operations {
			baseRec, ok := idx[key{alg, *baseline, op}]
			if !ok || baseRec.Median == 0 {
				continue
			}
			res := Result{
				Algorithm:       alg,
				Operation:       op,
				BaselineVariant: *baseline,
				BaselineMedian:  baseRec.Median,
				BaselineCI95:    baseRec.CI95,
			}
			for _, v := range variants {
				cmpRec, ok := idx[key{alg, v, op}]
				if !ok || cmpRec.Median == 0 {
					continue
				}
				sp := baseRec.Median / cmpRec.Median
				// Conservative CI: ratio of interval bounds.
				// speedup_lo = baseline_lo / cmp_hi
				// speedup_hi = baseline_hi / cmp_lo
				// A zero bound leaves the corresponding side at 0.
				var spCI [2]float64
				if cmpRec.CI95[1] > 0 {
					spCI[0] = safeDiv(baseRec.CI95[0], cmpRec.CI95[1])
				}
				if cmpRec.CI95[0] > 0 {
					spCI[1] = safeDiv(baseRec.CI95[1], cmpRec.CI95[0])
				}
				res.Comparisons = append(res.Comparisons, Speedup{
					Variant:   v,
					Median:    cmpRec.Median,
					Speedup:   sp,
					SpeedupCI: spCI,
				})
			}
			// Only emit rows that have at least one comparison.
			if len(res.Comparisons) > 0 {
				results = append(results, res)
			}
		}
	}

	// Print human-readable table to stderr (stdout stays machine-readable).
	printTable(os.Stderr, results, variants, *baseline)

	// Emit JSON.
	out, err := json.MarshalIndent(results, "", " ")
	if err != nil {
		fmt.Fprintf(os.Stderr, "error marshalling JSON: %v\n", err)
		os.Exit(1)
	}
	if *outFile != "" {
		if err := os.WriteFile(*outFile, out, 0o644); err != nil {
			fmt.Fprintf(os.Stderr, "error writing %s: %v\n", *outFile, err)
			os.Exit(1)
		}
		fmt.Fprintf(os.Stderr, "wrote %d results to %s\n", len(results), *outFile)
	} else {
		fmt.Println(string(out))
	}
}
|
||||
|
||||
// printTable writes one aligned, human-readable speedup table per algorithm
// to w, using tabwriter for column alignment. Rows sort by descending avx2
// speedup, then by operation name.
func printTable(w *os.File, results []Result, variants []string, baseline string) {
	tw := tabwriter.NewWriter(w, 0, 0, 2, ' ', 0)

	// Group by algorithm.
	byAlg := make(map[string][]Result)
	for _, r := range results {
		byAlg[r.Algorithm] = append(byAlg[r.Algorithm], r)
	}
	algs := make([]string, 0, len(byAlg))
	for a := range byAlg {
		algs = append(algs, a)
	}
	slices.Sort(algs)

	for _, alg := range algs {
		fmt.Fprintf(tw, "\n── %s (baseline: %s) ──\n", strings.ToUpper(alg), baseline)

		// Header: operation + baseline cycles, then one column per variant.
		var hdr strings.Builder
		fmt.Fprintf(&hdr, "%-38s\t%12s", "operation", baseline+"(cycles)")
		for _, v := range variants {
			fmt.Fprintf(&hdr, "\t%10s", v)
		}
		fmt.Fprintln(tw, hdr.String())
		fmt.Fprintln(tw, strings.Repeat("-", 38+13+11*len(variants)))

		rows := byAlg[alg]
		slices.SortFunc(rows, func(a, b Result) int {
			// Sort by descending avx2 speedup if available, else alphabetically.
			// NOTE(review): when both rows lack an avx2 comparison, sa and sb
			// are NaN, NaN != NaN is true, and cmp.Compare(NaN, NaN) yields 0
			// — the operation-name tiebreak is then skipped; confirm intended.
			sa := speedupFor(a, "avx2")
			sb := speedupFor(b, "avx2")
			if sa != sb {
				return cmp.Compare(sb, sa) // descending
			}
			return strings.Compare(a.Operation, b.Operation)
		})

		for _, r := range rows {
			var line strings.Builder
			fmt.Fprintf(&line, "%-38s\t%12s", r.Operation, formatCycles(r.BaselineMedian))
			for _, v := range variants {
				sp := speedupFor(r, v)
				if math.IsNaN(sp) {
					// No comparison recorded for this variant.
					fmt.Fprintf(&line, "\t%10s", "---")
				} else {
					fmt.Fprintf(&line, "\t%9.2fx", sp)
				}
			}
			fmt.Fprintln(tw, line.String())
		}
	}
	tw.Flush()
}
|
||||
|
||||
func speedupFor(r Result, variant string) float64 {
|
||||
for _, c := range r.Comparisons {
|
||||
if c.Variant == variant {
|
||||
return c.Speedup
|
||||
}
|
||||
}
|
||||
return math.NaN()
|
||||
}
|
||||
|
||||
// formatCycles renders a cycle count compactly, using an M suffix above one
// million and a K suffix above one thousand.
func formatCycles(c float64) string {
	switch {
	case c >= 1_000_000:
		return fmt.Sprintf("%.2fM", c/1_000_000)
	case c >= 1_000:
		return fmt.Sprintf("%.1fK", c/1_000)
	default:
		return fmt.Sprintf("%.0f", c)
	}
}
|
||||
|
||||
// safeDiv returns a/b, or 0 when the denominator is zero.
func safeDiv(a, b float64) float64 {
	if b != 0 {
		return a / b
	}
	return 0
}
|
||||
|
||||
func unique(records []Record, fn func(Record) string) []string {
|
||||
seen := make(map[string]struct{})
|
||||
for _, r := range records {
|
||||
seen[fn(r)] = struct{}{}
|
||||
}
|
||||
out := make([]string, 0, len(seen))
|
||||
for k := range seen {
|
||||
out = append(out, k)
|
||||
}
|
||||
slices.Sort(out)
|
||||
return out
|
||||
}
|
||||
|
|
@ -0,0 +1,487 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Matplotlib draft figures for the PQC SIMD speedup analysis.
|
||||
|
||||
Usage:
|
||||
python3 analysis/figures.py [--json analysis/results.json] [--out figures/]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.ticker as ticker
|
||||
import numpy as np
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Config
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Cumulative stages used in Figure 1 (each shows total speedup from refo0)
|
||||
STAGE_KEYS = ["refo0_to_refnv", "refo0_to_ref", "refo0_to_avx2"]
|
||||
STAGE_LABELS = ["O3, no auto-vec", "O3 + auto-vec", "O3 + hand SIMD (avx2)"]
|
||||
STAGE_COLORS = ["#4C72B0", "#55A868", "#C44E52"]
|
||||
|
||||
# Ops to show in the primary figures (excludes top-level KEM wrappers)
|
||||
PRIMARY_OPS = {
|
||||
"poly_frommsg", "INVNTT", "polyvec_basemul_acc_montgomery", "NTT",
|
||||
"indcpa_dec", "polyvec_decompress", "poly_decompress",
|
||||
"poly_compress", "poly_tomsg", "polyvec_compress",
|
||||
"indcpa_enc", "indcpa_keypair", "gen_a",
|
||||
"poly_getnoise_eta1", "poly_getnoise_eta2",
|
||||
}
|
||||
|
||||
# Short display names
|
||||
OP_SHORT = {
|
||||
"poly_frommsg": "frommsg",
|
||||
"INVNTT": "INVNTT",
|
||||
"polyvec_basemul_acc_montgomery": "basemul",
|
||||
"NTT": "NTT",
|
||||
"indcpa_dec": "dec",
|
||||
"polyvec_decompress": "pvec_decomp",
|
||||
"poly_decompress": "poly_decomp",
|
||||
"poly_compress": "poly_comp",
|
||||
"poly_tomsg": "tomsg",
|
||||
"polyvec_compress": "pvec_comp",
|
||||
"indcpa_enc": "enc",
|
||||
"indcpa_keypair": "keypair",
|
||||
"gen_a": "gen_a",
|
||||
"poly_getnoise_eta1": "noise_η₁",
|
||||
"poly_getnoise_eta2": "noise_η₂",
|
||||
}
|
||||
|
||||
ALGORITHMS = ["mlkem512", "mlkem768", "mlkem1024"]
|
||||
ALG_TITLES = {"mlkem512": "ML-KEM-512", "mlkem768": "ML-KEM-768", "mlkem1024": "ML-KEM-1024"}
|
||||
|
||||
# Operations selected to illustrate the distribution figure:
|
||||
# one high-speedup arithmetic op, one medium SHAKE-bound op, one low-speedup op
|
||||
DIST_OPS = [
|
||||
("INVNTT", "INVNTT\n(~55× speedup)"),
|
||||
("gen_a", "gen_a\n(~4× speedup)"),
|
||||
("poly_getnoise_eta1","noise η₁\n(~1.3× speedup)"),
|
||||
]
|
||||
|
||||
# Per-polynomial ops whose speedup should be param-independent
|
||||
CROSS_PARAM_OPS = [
|
||||
"poly_frommsg",
|
||||
"INVNTT",
|
||||
"polyvec_basemul_acc_montgomery",
|
||||
"NTT",
|
||||
]
|
||||
|
||||
# KEM-level ops for supplementary
|
||||
KEM_OPS = ["kyber_keypair", "kyber_encaps", "kyber_decaps"]
|
||||
KEM_SHORT = {"kyber_keypair": "KeyGen", "kyber_encaps": "Encaps", "kyber_decaps": "Decaps"}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def load(json_path: str) -> list[dict]:
    """Deserialize the analysis results JSON produced by analyze.py."""
    with open(json_path) as handle:
        return json.load(handle)
|
||||
|
||||
|
||||
def ops_for_alg(results: list[dict], alg: str) -> list[dict]:
    """Rows for one algorithm, restricted to PRIMARY_OPS and sorted by
    descending ref→avx2 speedup (rows lacking that comparison sort last)."""
    selected = [
        row for row in results
        if row["algorithm"] == alg and row["operation"] in PRIMARY_OPS
    ]
    selected.sort(key=lambda row: -row["comparisons"].get("ref_to_avx2", {}).get("speedup", 0))
    return selected
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Figure 1: cumulative grouped bars — speedup at each optimisation stage
|
||||
#
|
||||
# Each group shows three bars for one operation:
|
||||
# refo0→refnv total speedup with O3, auto-vec OFF
|
||||
# refo0→ref total speedup with O3, auto-vec ON
|
||||
# refo0→avx2 total speedup with O3 + hand-written SIMD
|
||||
#
|
||||
# Because all bars share the same baseline (refo0=1), they are directly
|
||||
# comparable without any additive/multiplicative ambiguity.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def fig_decomposition(results: list[dict], out_dir: Path) -> None:
    """Figure 1: grouped bars of cumulative speedup per optimisation stage.

    One panel per algorithm in ALGORITHMS; within each panel, one group of
    three bars per operation (STAGE_KEYS), all measured against the shared
    refo0 baseline so bars are directly comparable on the log axis.
    Saved via _save(fig, out_dir, "decomposition").
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharey=False)

    for ax, alg in zip(axes, ALGORITHMS):
        rows = ops_for_alg(results, alg)
        if not rows:
            # No data for this algorithm: hide the panel rather than
            # leaving an empty axes frame.
            ax.set_visible(False)
            continue

        ops = [OP_SHORT.get(r["operation"], r["operation"]) for r in rows]
        n = len(rows)
        group = np.arange(n)
        # Three bars per group, evenly spaced within each group slot
        bar_w = 0.22
        offsets = np.array([-bar_w, 0, bar_w])

        for (key, label, color), offset in zip(
            zip(STAGE_KEYS, STAGE_LABELS, STAGE_COLORS), offsets
        ):
            # Missing comparisons default to 0 and are masked out below.
            vals = np.array([r["comparisons"].get(key, {}).get("speedup", 0.0) for r in rows])
            ci_lo = np.array([r["comparisons"].get(key, {}).get("ci95", [0.0, 0.0])[0] for r in rows])
            ci_hi = np.array([r["comparisons"].get(key, {}).get("ci95", [0.0, 0.0])[1] for r in rows])
            # Asymmetric error bars: distance from the point estimate to
            # each CI bound.
            yerr = np.array([vals - ci_lo, ci_hi - vals])
            mask = vals > 0

            ax.bar(group[mask] + offset, vals[mask], bar_w,
                   label=label, color=color, alpha=0.88, zorder=3)
            ax.errorbar(group[mask] + offset, vals[mask], yerr=yerr[:, mask],
                        fmt="none", ecolor="black", elinewidth=0.7, capsize=2, zorder=4)

        ax.set_yscale("log")
        ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda v, _: f"{v:g}×"))
        ax.set_title(ALG_TITLES[alg], fontsize=12, fontweight="bold")
        ax.set_xticks(group)
        ax.set_xticklabels(ops, rotation=45, ha="right", fontsize=8)
        # Only the leftmost panel carries the shared y-axis label.
        ax.set_ylabel("Speedup over -O0 (×, log scale)" if alg == "mlkem512" else "")
        ax.grid(axis="y", which="both", linestyle="--", linewidth=0.4, alpha=0.5, zorder=0)
        ax.set_axisbelow(True)
        ax.set_xlim(-0.5, n - 0.5)

    # Single shared legend above all panels (labels are identical per panel).
    handles, labels = axes[0].get_legend_handles_labels()
    fig.legend(handles, labels, loc="upper center", ncol=3,
               fontsize=10, frameon=True, bbox_to_anchor=(0.5, 1.02))
    fig.suptitle(
        "ML-KEM Cumulative Speedup at Each Optimisation Stage "
        "(Intel Xeon Platinum 8268, 95% bootstrap CI)",
        fontsize=11, y=1.06,
    )
    fig.tight_layout()
    # _save is defined elsewhere in this file — presumably writes the
    # figure under out_dir using the given basename.
    _save(fig, out_dir, "decomposition")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Figure 2: hand-SIMD speedup (ref→avx2), all algorithms overlaid, log scale
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def fig_hand_simd(results: list[dict], out_dir: Path) -> None:
    """Plot ref→avx2 hand-SIMD speedup for every primary operation.

    One bar group per operation (sorted by descending mlkem512 speedup),
    one bar per algorithm, log y-axis, 95% bootstrap CI error bars.
    Operations lacking a ref_to_avx2 comparison are skipped; zero-valued
    bars are masked out.
    """
    # Collect {operation: {algorithm: comparison dict}} for primary ops only.
    by_op: dict[str, dict] = {}
    for row in results:
        op = row["operation"]
        if op in PRIMARY_OPS and "ref_to_avx2" in row["comparisons"]:
            by_op.setdefault(op, {})[row["algorithm"]] = row["comparisons"]["ref_to_avx2"]

    # Order operations by descending mlkem512 speedup.
    order = sorted(by_op, key=lambda op: -by_op[op].get("mlkem512", {}).get("speedup", 0))
    labels = [OP_SHORT.get(op, op) for op in order]

    positions = np.arange(len(order))
    width = 0.25
    shifts = [-width, 0, width]
    palette = ["#4C72B0", "#55A868", "#C44E52"]

    fig, ax = plt.subplots(figsize=(14, 5))

    for alg, shift, color in zip(ALGORITHMS, shifts, palette):
        comps = [by_op[op].get(alg, {}) for op in order]
        heights = np.array([c.get("speedup", 0) for c in comps])
        lo = np.array([c.get("ci95", [0, 0])[0] for c in comps])
        hi = np.array([c.get("ci95", [0, 0])[1] for c in comps])
        err = np.array([heights - lo, hi - heights])
        keep = heights > 0

        ax.bar(positions[keep] + shift, heights[keep], width,
               label=ALG_TITLES[alg], color=color, alpha=0.85, zorder=3)
        ax.errorbar(positions[keep] + shift, heights[keep], yerr=err[:, keep],
                    fmt="none", ecolor="black", elinewidth=0.7, capsize=2, zorder=4)

    ax.set_yscale("log")
    ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda v, _: f"{v:g}×"))
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=9)
    ax.set_ylabel("Speedup ref → avx2 (×, log scale)")
    ax.set_title(
        "Hand-Written SIMD Speedup over Compiler-Optimised C\n"
        "(Intel Xeon Platinum 8268, 95% bootstrap CI, n≥2000 per group)"
    )
    ax.grid(axis="y", which="both", linestyle="--", linewidth=0.4, alpha=0.5, zorder=0)
    ax.set_axisbelow(True)
    ax.legend(fontsize=10)

    fig.tight_layout()
    _save(fig, out_dir, "hand_simd_speedup")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Figure 3: Cliff's delta heatmap (ref→avx2)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def fig_cliffs_heatmap(results: list[dict], out_dir: Path) -> None:
    """Heatmap of Cliff's delta (ref vs. avx2): algorithms × operations.

    Columns are ordered by each operation's best (maximum) delta across
    algorithms; cells with no ref_to_avx2 comparison stay NaN (blank).
    """
    def best_delta(op: str) -> float:
        # Largest delta observed for this operation across all algorithms.
        return max(
            r["comparisons"]["ref_to_avx2"]["cliffs_delta"]
            for r in results
            if r["operation"] == op and "ref_to_avx2" in r["comparisons"]
        )

    available = {r["operation"] for r in results if "ref_to_avx2" in r["comparisons"]}
    ops_set = sorted(available, key=lambda op: -best_delta(op))
    labels = [OP_SHORT.get(op, op) for op in ops_set]

    grid = np.full((len(ALGORITHMS), len(ops_set)), np.nan)
    for row_i, alg in enumerate(ALGORITHMS):
        for col_j, op in enumerate(ops_set):
            hits = [r for r in results if r["algorithm"] == alg and r["operation"] == op]
            if hits and "ref_to_avx2" in hits[0]["comparisons"]:
                grid[row_i, col_j] = hits[0]["comparisons"]["ref_to_avx2"]["cliffs_delta"]

    n_cols = len(ops_set)
    fig, ax = plt.subplots(figsize=(max(10, n_cols * 0.85), 3.2))
    im = ax.imshow(grid, aspect="auto", cmap="RdYlGn", vmin=-1, vmax=1)
    plt.colorbar(im, ax=ax, label="Cliff's δ", fraction=0.03, pad=0.02)

    ax.set_yticks(range(len(ALGORITHMS)))
    ax.set_yticklabels([ALG_TITLES[a] for a in ALGORITHMS], fontsize=10)
    ax.set_xticks(range(n_cols))
    ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=9)
    ax.set_title(
        "Cliff's δ (ref vs. avx2) δ = +1.00: avx2 strictly faster in every observation pair",
        fontsize=10,
    )

    # Annotate each populated cell with its delta value.
    for row_i in range(len(ALGORITHMS)):
        for col_j in range(n_cols):
            value = grid[row_i, col_j]
            if np.isnan(value):
                continue
            # White text on dark green cells, black elsewhere
            ax.text(col_j, row_i, f"{value:+.3f}", ha="center", va="center",
                    fontsize=9, color="white" if value > 0.85 else "black",
                    fontweight="bold")

    fig.tight_layout()
    _save(fig, out_dir, "cliffs_delta_heatmap")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Figure 4: cycle distribution overlays (requires raw aggregator JSON)
|
||||
#
|
||||
# Three panels: one high-speedup op, one medium, one low.
|
||||
# Each panel overlays ref and avx2 histograms + KDE for mlkem512.
|
||||
# Log x-axis exposes the scale difference honestly.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def fig_distributions(raw_records: list[dict], out_dir: Path, alg: str = "mlkem512") -> None:
    """Overlay per-sample cycle-count histograms and KDEs for ref vs. avx2.

    One panel per entry in DIST_OPS; each panel shows both variants for
    *alg* on a log-scale x-axis with dashed median lines. Panels with no
    raw data for either variant are hidden.

    raw_records: raw aggregator JSON records; only entries whose "raw"
        sample list is non-empty are used.
    """
    from scipy.stats import gaussian_kde

    # Build lookup: (alg, variant, op) → raw array
    raw: dict[tuple, np.ndarray] = {}
    for r in raw_records:
        if r.get("raw"):
            raw[(r["algorithm"], r["variant"], r["operation"])] = np.array(r["raw"], dtype=np.float64)

    n_ops = len(DIST_OPS)
    # NOTE(review): plt.subplots(1, 1) returns a bare Axes rather than an
    # array, so this assumes len(DIST_OPS) >= 2 — confirm if DIST_OPS shrinks.
    fig, axes = plt.subplots(1, n_ops, figsize=(5 * n_ops, 4))

    # Fixed draw styles; ref gets the lower zorder so avx2 draws on top.
    variant_style = {
        "ref": {"color": "#4C72B0", "label": "ref (O3)", "alpha": 0.55, "zorder": 2},
        "avx2": {"color": "#C44E52", "label": "avx2", "alpha": 0.65, "zorder": 3},
    }

    for ax, (op, subtitle) in zip(axes, DIST_OPS):
        plotted_any = False
        for variant in ("ref", "avx2"):
            arr = raw.get((alg, variant, op))
            if arr is None:
                continue
            plotted_any = True
            s = variant_style[variant]

            # Histogram on log scale: log-spaced bins spanning whole decades
            # so both variants get comparable relative bin widths.
            log_arr = np.log10(arr)
            lo, hi = np.floor(log_arr.min()), np.ceil(log_arr.max())
            bins = np.logspace(lo, hi, 60)
            ax.hist(arr, bins=bins, density=True, color=s["color"],
                    alpha=s["alpha"], zorder=s["zorder"], label=s["label"])

            # KDE on log scale, back-transformed to cycle units.
            kde = gaussian_kde(log_arr, bw_method=0.25)
            xs_log = np.linspace(lo, hi, 400)
            xs = 10 ** xs_log
            # KDE is in log space; convert density: p(x) = p(log x) / (x ln10)
            ys = kde(xs_log) / (xs * np.log(10))
            ax.plot(xs, ys, color=s["color"], linewidth=1.8, zorder=s["zorder"] + 1)

            # Median line (dashed, same colour as the variant).
            med = float(np.median(arr))
            ax.axvline(med, color=s["color"], linewidth=1.2, linestyle="--", zorder=5)

        if not plotted_any:
            # No data at all for this op — hide the empty panel.
            ax.set_visible(False)
            continue

        ax.set_xscale("log")
        ax.set_xlabel("Cycles (log scale)")
        # Only the first (leftmost) panel carries a y-axis label.
        ax.set_ylabel("Density" if op == DIST_OPS[0][0] else "")
        ax.set_title(subtitle, fontsize=10)
        ax.legend(fontsize=9)
        ax.xaxis.set_major_formatter(ticker.LogFormatterSciNotation(labelOnlyBase=False))
        ax.grid(axis="x", which="both", linestyle="--", linewidth=0.4, alpha=0.4)
        ax.set_axisbelow(True)

    fig.suptitle(
        f"Cycle Count Distributions — ref vs. avx2 ({ALG_TITLES[alg]})\n"
        "Dashed lines show medians. Distributions are right-skewed → nonparametric statistics.",
        fontsize=10,
    )
    fig.tight_layout()
    _save(fig, out_dir, "distributions")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Figure 5: cross-param speedup consistency
|
||||
#
|
||||
# For per-polynomial operations the polynomial dimension is always 256,
|
||||
# independent of the security parameter k. Speedups should be identical
|
||||
# across mlkem512/768/1024. This figure verifies that.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def fig_cross_param(results: list[dict], out_dir: Path) -> None:
    """Compare ref→avx2 speedups of per-polynomial operations across the
    three ML-KEM parameter sets.

    The polynomial dimension is 256 for all parameter sets, so speedups
    should be essentially identical across mlkem512/768/1024; this figure
    makes that check visual. Missing (alg, op) cells become zero bars and
    are masked out of the plot.
    """
    ops = CROSS_PARAM_OPS
    short = [OP_SHORT.get(op, op) for op in ops]
    positions = np.arange(len(ops))
    width = 0.22
    shifts = np.array([-width, 0, width])
    palette = ["#4C72B0", "#55A868", "#C44E52"]

    fig, ax = plt.subplots(figsize=(8, 4))

    for alg, shift, color in zip(ALGORITHMS, shifts, palette):
        heights, lo, hi = [], [], []
        for op in ops:
            hits = [r for r in results if r["algorithm"] == alg and r["operation"] == op]
            if hits and "ref_to_avx2" in hits[0]["comparisons"]:
                comp = hits[0]["comparisons"]["ref_to_avx2"]
                heights.append(comp["speedup"])
                lo.append(comp["ci95"][0])
                hi.append(comp["ci95"][1])
            else:
                # Keep the three arrays aligned; zero bars are masked below.
                heights.append(0)
                lo.append(0)
                hi.append(0)

        heights = np.array(heights)
        lo = np.array(lo)
        hi = np.array(hi)
        err = np.array([heights - lo, hi - heights])
        keep = heights > 0

        ax.bar(positions[keep] + shift, heights[keep], width,
               label=ALG_TITLES[alg], color=color, alpha=0.88, zorder=3)
        ax.errorbar(positions[keep] + shift, heights[keep], yerr=err[:, keep],
                    fmt="none", ecolor="black", elinewidth=0.8, capsize=3, zorder=4)

    ax.set_xticks(positions)
    ax.set_xticklabels(short, fontsize=11)
    ax.set_ylabel("Speedup ref → avx2 (×)")
    ax.set_title(
        "Per-Polynomial Operation Speedup Across Security Parameters\n"
        "(polynomial dim = 256 for all; NTT variation attributed to cache-state differences)"
    )
    ax.yaxis.set_minor_locator(ticker.AutoMinorLocator())
    ax.grid(axis="y", linestyle="--", linewidth=0.4, alpha=0.5, zorder=0)
    ax.set_axisbelow(True)
    ax.legend(fontsize=10)

    fig.tight_layout()
    _save(fig, out_dir, "cross_param")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Figure S1: KEM-level end-to-end speedup (supplementary)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def fig_kem_level(results: list[dict], out_dir: Path) -> None:
    """Supplementary figure: end-to-end KEM speedup (ref → avx2).

    One bar group per KEM-level operation in KEM_OPS, one bar per
    algorithm, with 95% bootstrap CI error bars. Missing (alg, op)
    cells become zero bars and are masked out.
    """
    ops = KEM_OPS
    short = [KEM_SHORT[op] for op in ops]
    positions = np.arange(len(ops))
    width = 0.22
    shifts = np.array([-width, 0, width])
    palette = ["#4C72B0", "#55A868", "#C44E52"]

    fig, ax = plt.subplots(figsize=(7, 4))

    for alg, shift, color in zip(ALGORITHMS, shifts, palette):
        heights, lo, hi = [], [], []
        for op in ops:
            hits = [r for r in results if r["algorithm"] == alg and r["operation"] == op]
            if hits and "ref_to_avx2" in hits[0]["comparisons"]:
                comp = hits[0]["comparisons"]["ref_to_avx2"]
                heights.append(comp["speedup"])
                lo.append(comp["ci95"][0])
                hi.append(comp["ci95"][1])
            else:
                # Keep the three arrays aligned; zero bars are masked below.
                heights.append(0)
                lo.append(0)
                hi.append(0)

        heights = np.array(heights)
        lo = np.array(lo)
        hi = np.array(hi)
        err = np.array([heights - lo, hi - heights])
        keep = heights > 0

        ax.bar(positions[keep] + shift, heights[keep], width,
               label=ALG_TITLES[alg], color=color, alpha=0.88, zorder=3)
        ax.errorbar(positions[keep] + shift, heights[keep], yerr=err[:, keep],
                    fmt="none", ecolor="black", elinewidth=0.8, capsize=3, zorder=4)

    ax.set_xticks(positions)
    ax.set_xticklabels(short, fontsize=12)
    ax.set_ylabel("Speedup ref → avx2 (×)")
    ax.set_title(
        "End-to-End KEM Speedup (ref → avx2)\n"
        "(Intel Xeon Platinum 8268, 95% bootstrap CI)"
    )
    ax.yaxis.set_minor_locator(ticker.AutoMinorLocator())
    ax.grid(axis="y", linestyle="--", linewidth=0.4, alpha=0.5, zorder=0)
    ax.set_axisbelow(True)
    ax.legend(fontsize=10)

    fig.tight_layout()
    _save(fig, out_dir, "kem_level")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Shared save helper
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _save(fig: plt.Figure, out_dir: Path, stem: str) -> None:
    """Write *fig* under *out_dir* as both PDF and 150-dpi PNG, then close it."""
    for ext, extra in (("pdf", {}), ("png", {"dpi": 150})):
        fig.savefig(out_dir / f"{stem}.{ext}", bbox_inches="tight", **extra)
    print(f"Saved {out_dir}/{stem}.{{pdf,png}}")
    plt.close(fig)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main() -> None:
    """CLI entry point: load result JSON and render every figure.

    All core figures need only the analyzed results JSON (--json); the
    distributions figure additionally needs the raw aggregator JSON
    (--raw-json) and is skipped when that flag is absent.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--json", default="analysis/results.json",
                        help="analyzed results JSON (from analyze.py)")
    parser.add_argument("--raw-json", default=None,
                        help="raw aggregator JSON (from aggregate --raw); required for --distributions")
    parser.add_argument("--out", default="analysis/figures")
    args = parser.parse_args()

    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)

    results = load(args.json)
    print(f"Loaded {len(results)} result rows.")

    # Core figures: analyzed results only.
    for render in (fig_decomposition, fig_hand_simd, fig_cliffs_heatmap,
                   fig_cross_param, fig_kem_level):
        render(results, out_dir)

    # Distributions figure: needs per-sample raw data.
    if args.raw_json:
        raw_records = load(args.raw_json)
        print(f"Loaded {len(raw_records)} raw groups for distributions.")
        fig_distributions(raw_records, out_dir)
    else:
        print("Skipping distributions figure (pass --raw-json to enable).")


if __name__ == "__main__":
    main()
|
||||
Binary file not shown.
Binary file not shown.
|
After Width: | Height: | Size: 94 KiB |
Binary file not shown.
Binary file not shown.
|
After Width: | Height: | Size: 58 KiB |
Binary file not shown.
Binary file not shown.
|
After Width: | Height: | Size: 122 KiB |
Binary file not shown.
Binary file not shown.
|
After Width: | Height: | Size: 116 KiB |
Binary file not shown.
Binary file not shown.
|
After Width: | Height: | Size: 102 KiB |
Binary file not shown.
Binary file not shown.
|
After Width: | Height: | Size: 52 KiB |
|
|
@ -0,0 +1,3 @@
|
|||
module git.levineuwirth.org/neuwirth/where-simd-helps/analysis
|
||||
|
||||
go 1.26.1
|
||||
|
|
@ -0,0 +1,189 @@
|
|||
// Package parse reads pqc-bench .out files produced by the SLURM harness.
|
||||
//
|
||||
// Each file contains a SLURM prolog header followed by 1–N "loop spin" blocks.
|
||||
// Each spin block reports one median+average pair per benchmarked operation.
|
||||
package parse
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Meta holds the SLURM prolog metadata extracted from the file header.
// Each field is populated from a "## <key> : <value>" line by parsePrologLine.
type Meta struct {
	JobID     string // "## Job ID : ..."
	JobName   string // "## Job Name : ..."
	Node      string // "## Nodelist : ..."
	StartedAt string // "## Job Started : ..." (free-form date string, not parsed)
	Directory string // "## Directory : ..." — working directory of the job
	// Explicit fields emitted by submit.sh for reliable downstream parsing.
	BenchVariant string // "## BENCH_VARIANT : ..." (e.g. "ref", "avx2")
	BenchParam   string // "## BENCH_PARAM : ..." (e.g. "512", "768", "1024")
	BenchNSpins  string // "## BENCH_NSPINS : ..." — configured loop-spin count
}
|
||||
|
||||
// Measurement is a single operation's reported statistics for one loop spin.
type Measurement struct {
	Median  int64 // from the "median: <N> cycles/ticks" line
	Average int64 // from the "average: <N> cycles/ticks" line
}
|
||||
|
||||
// Run holds everything parsed from one .out file.
type Run struct {
	File string // path the run was parsed from
	Meta Meta   // SLURM prolog metadata
	// Spins[i] maps operation name → measurement for loop spin i+1.
	Spins []map[string]Measurement
}
|
||||
|
||||
// ParseFile reads a single .out file and returns a Run.
//
// Expected layout: a SLURM prolog of "## key : value" lines, then one or
// more "Loop spin:" blocks. Within a spin, each operation appears as a
// "<name>:" line followed by "median: <N> ..." and "average: <N> ..."
// lines; the pair is committed to the spin map when the average arrives.
// Lines before the first spin header (other than the prolog) are ignored.
func ParseFile(path string) (*Run, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	run := &Run{File: path}
	scanner := bufio.NewScanner(f)
	// Default buffer size is 64KB; lines are short so this is fine.

	var currentSpin map[string]Measurement // op name → measurement for the spin being built
	var currentOp string                   // operation awaiting its median/average pair
	var pendingMedian int64                // median value parked until the matching average line
	inSpin := false                        // true once the first "Loop spin:" header is seen

	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())

		// SLURM prolog lines start with ##
		if strings.HasPrefix(line, "##") {
			parsePrologLine(line, &run.Meta)
			continue
		}

		// New loop spin: flush the spin in progress, then start a fresh map.
		if strings.HasPrefix(line, "Loop spin:") {
			if inSpin && currentSpin != nil {
				run.Spins = append(run.Spins, currentSpin)
			}
			currentSpin = make(map[string]Measurement)
			currentOp = ""
			inSpin = true
			continue
		}

		// Ignore anything before the first spin header (harness chatter).
		if !inSpin {
			continue
		}

		// Operation name line ends with ':'. The prefix guards are defensive:
		// a bare "median:"/"average:" line would otherwise be misread as an
		// operation name.
		if strings.HasSuffix(line, ":") && !strings.HasPrefix(line, "median") && !strings.HasPrefix(line, "average") {
			currentOp = strings.TrimSuffix(line, ":")
			currentOp = strings.TrimSpace(currentOp)
			continue
		}

		// Stat lines are only meaningful once an operation has been named.
		if currentOp == "" {
			continue
		}

		if strings.HasPrefix(line, "median:") {
			v, err := parseCycles(line)
			if err != nil {
				return nil, fmt.Errorf("%s: %w", path, err)
			}
			pendingMedian = v
			continue
		}

		if strings.HasPrefix(line, "average:") {
			avg, err := parseCycles(line)
			if err != nil {
				return nil, fmt.Errorf("%s: %w", path, err)
			}
			// The average line closes the pair: record the measurement and
			// reset state so later stray stat lines are not attributed here.
			currentSpin[currentOp] = Measurement{Median: pendingMedian, Average: avg}
			currentOp = ""
			pendingMedian = 0
			continue
		}
	}

	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("%s: %w", path, err)
	}

	// Flush last spin
	if inSpin && currentSpin != nil {
		run.Spins = append(run.Spins, currentSpin)
	}

	return run, nil
}
|
||||
|
||||
// parseCycles extracts the integer from lines like "median: 25194 cycles/ticks".
// The value is the second whitespace-separated field; any trailing unit text
// is ignored.
func parseCycles(line string) (int64, error) {
	fields := strings.Fields(line)
	if len(fields) < 2 {
		return 0, fmt.Errorf("unexpected line format: %q", line)
	}
	return strconv.ParseInt(fields[1], 10, 64)
}
|
||||
|
||||
// parsePrologLine extracts key/value pairs from SLURM header lines.
|
||||
func parsePrologLine(line string, meta *Meta) {
|
||||
// Lines look like: "## Job ID : 11233228"
|
||||
// Strip leading "##" and optional decoration lines ("####...")
|
||||
trimmed := strings.TrimLeft(line, "#")
|
||||
trimmed = strings.TrimSpace(trimmed)
|
||||
key, val, ok := strings.Cut(trimmed, ":")
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
key = strings.TrimSpace(key)
|
||||
val = strings.TrimSpace(val)
|
||||
|
||||
switch key {
|
||||
case "Job ID":
|
||||
meta.JobID = val
|
||||
case "Job Name":
|
||||
meta.JobName = val
|
||||
case "Nodelist":
|
||||
meta.Node = val
|
||||
case "Job Started":
|
||||
meta.StartedAt = val
|
||||
case "Directory":
|
||||
meta.Directory = val
|
||||
case "BENCH_VARIANT":
|
||||
meta.BenchVariant = val
|
||||
case "BENCH_PARAM":
|
||||
meta.BenchParam = val
|
||||
case "BENCH_NSPINS":
|
||||
meta.BenchNSpins = val
|
||||
}
|
||||
}
|
||||
|
||||
// InferVariant returns the benchmark variant for a run.
|
||||
//
|
||||
// Priority:
|
||||
// 1. Explicit BENCH_VARIANT metadata emitted by submit.sh (most reliable).
|
||||
// 2. The path segment immediately following "kyber/" in the SLURM Directory
|
||||
// field (works for old-style runs that ran from inside the kyber tree).
|
||||
// 3. "unknown" if neither is available.
|
||||
func InferVariant(meta Meta) string {
|
||||
if meta.BenchVariant != "" {
|
||||
return meta.BenchVariant
|
||||
}
|
||||
const marker = "kyber/"
|
||||
idx := strings.LastIndex(meta.Directory, marker)
|
||||
if idx < 0 {
|
||||
return "unknown"
|
||||
}
|
||||
rest := meta.Directory[idx+len(marker):]
|
||||
variant, _, _ := strings.Cut(rest, "/")
|
||||
return variant
|
||||
}
|
||||
|
|
@ -0,0 +1,133 @@
|
|||
// Package stats computes summary statistics over slices of cycle counts.
|
||||
package stats
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"math"
|
||||
"math/rand/v2"
|
||||
"slices"
|
||||
)
|
||||
|
||||
const bootstrapN = 10_000
|
||||
|
||||
// Summary holds all computed statistics for one (algorithm, variant, operation) group.
type Summary struct {
	N    int     // sample count
	Mean float64 // arithmetic mean
	// Median is the sample median (p50).
	Median float64
	Std    float64 // population standard deviation (divides by N, not N-1)
	MAD    float64 // median absolute deviation from the median
	P5     float64 // percentiles use linear interpolation (numpy default)
	P25    float64
	P75    float64
	P95    float64
	P99    float64
	// CI95 is the bootstrapped 95% confidence interval for the median.
	CI95 [2]float64
}
|
||||
|
||||
// Compute derives all statistics from a sorted (ascending) slice of values.
|
||||
// The caller must sort the slice before passing it in.
|
||||
func Compute(sorted []int64) Summary {
|
||||
n := len(sorted)
|
||||
if n == 0 {
|
||||
return Summary{}
|
||||
}
|
||||
|
||||
s := Summary{N: n}
|
||||
s.Mean = mean(sorted)
|
||||
s.Median = percentileFromSorted(sorted, 50)
|
||||
s.Std = stddev(sorted, s.Mean)
|
||||
s.MAD = mad(sorted, s.Median)
|
||||
s.P5 = percentileFromSorted(sorted, 5)
|
||||
s.P25 = percentileFromSorted(sorted, 25)
|
||||
s.P75 = percentileFromSorted(sorted, 75)
|
||||
s.P95 = percentileFromSorted(sorted, 95)
|
||||
s.P99 = percentileFromSorted(sorted, 99)
|
||||
s.CI95 = bootstrapMedianCI(sorted, bootstrapN)
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
// mean returns the arithmetic mean of xs as a float64.
func mean(xs []int64) float64 {
	total := 0.0
	for _, v := range xs {
		total += float64(v)
	}
	return total / float64(len(xs))
}
|
||||
|
||||
// stddev returns the population standard deviation of xs around the
// precomputed mean m (divides by N, not N-1).
func stddev(xs []int64, m float64) float64 {
	var sumSq float64
	for _, v := range xs {
		diff := float64(v) - m
		sumSq += diff * diff
	}
	return math.Sqrt(sumSq / float64(len(xs)))
}
|
||||
|
||||
func mad(sorted []int64, median float64) float64 {
|
||||
devs := make([]float64, len(sorted))
|
||||
for i, x := range sorted {
|
||||
devs[i] = math.Abs(float64(x) - median)
|
||||
}
|
||||
slices.Sort(devs)
|
||||
n := len(devs)
|
||||
if n%2 == 0 {
|
||||
return (devs[n/2-1] + devs[n/2]) / 2
|
||||
}
|
||||
return devs[n/2]
|
||||
}
|
||||
|
||||
// percentileFromSorted returns the p-th percentile of a sorted slice using
// linear interpolation between the two nearest ranks (same as numpy's default).
func percentileFromSorted(sorted []int64, p float64) float64 {
	if len(sorted) == 1 {
		return float64(sorted[0])
	}
	pos := p / 100 * float64(len(sorted)-1)
	base := int(math.Floor(pos))
	next := int(math.Ceil(pos))
	w := pos - float64(base)
	return float64(sorted[base])*(1-w) + float64(sorted[next])*w
}
|
||||
|
||||
// bootstrapMedianCI resamples the data bootstrapN times and returns the
|
||||
// [2.5th, 97.5th] percentile of the bootstrap median distribution.
|
||||
func bootstrapMedianCI(sorted []int64, iters int) [2]float64 {
|
||||
n := len(sorted)
|
||||
buf := make([]int64, n)
|
||||
medians := make([]float64, iters)
|
||||
|
||||
for i := range iters {
|
||||
for j := range n {
|
||||
buf[j] = sorted[rand.IntN(n)]
|
||||
}
|
||||
slices.Sort(buf)
|
||||
medians[i] = percentileFromSorted(buf, 50)
|
||||
}
|
||||
|
||||
slices.Sort(medians)
|
||||
return [2]float64{
|
||||
percentile64(medians, 2.5),
|
||||
percentile64(medians, 97.5),
|
||||
}
|
||||
}
|
||||
|
||||
// percentile64 returns the p-th percentile of a sorted float64 slice using
// linear interpolation between the two nearest ranks.
func percentile64(sorted []float64, p float64) float64 {
	if len(sorted) == 1 {
		return sorted[0]
	}
	pos := p / 100 * float64(len(sorted)-1)
	base := int(math.Floor(pos))
	next := int(math.Ceil(pos))
	w := pos - float64(base)
	return sorted[base]*(1-w) + sorted[next]*w
}
|
||||
|
||||
// SortInt64 sorts a slice of int64 in place, ascending.
func SortInt64(xs []int64) {
	slices.SortFunc(xs, func(a, b int64) int { return cmp.Compare(a, b) })
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,19 @@
|
|||
## SLURM PROLOG ###############################################################
|
||||
## Job ID : 1179894
|
||||
## Job Name : bench_mlkem1024_avx2
|
||||
## Nodelist : node2334
|
||||
## CPUs : 1
|
||||
## Mem/Node : 256 MB
|
||||
## Directory : /oscar/data/lshu/lneuwirt/where-simd-helps/slurm
|
||||
## Job Started : Thu Apr 2 12:18:20 PM EDT 2026
|
||||
###############################################################################
|
||||
pid 1627591's current affinity list: 41
|
||||
## BENCH_VARIANT : avx2
|
||||
## BENCH_PARAM : 1024
|
||||
## BENCH_NSPINS : 1000
|
||||
## BENCH_BINARY : /users/lneuwirt/data/lneuwirt/where-simd-helps/harness/build-hpc/bench_mlkem1024_avx2
|
||||
## BENCH_DATE : 2026-04-02T12:18:20-04:00
|
||||
## CPU_MODEL : Intel(R) Xeon(R) Platinum 8268 CPU @ 2.90GHz
|
||||
## PERF_PARANOID : 2
|
||||
## PAPI_BUILD : OFF
|
||||
ERROR: binary not found or not executable:
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,19 @@
|
|||
## SLURM PROLOG ###############################################################
|
||||
## Job ID : 1179893
|
||||
## Job Name : bench_mlkem1024_ref
|
||||
## Nodelist : node2334
|
||||
## CPUs : 1
|
||||
## Mem/Node : 256 MB
|
||||
## Directory : /oscar/data/lshu/lneuwirt/where-simd-helps/slurm
|
||||
## Job Started : Thu Apr 2 12:18:20 PM EDT 2026
|
||||
###############################################################################
|
||||
pid 1627590's current affinity list: 40
|
||||
## BENCH_VARIANT : ref
|
||||
## BENCH_PARAM : 1024
|
||||
## BENCH_NSPINS : 1000
|
||||
## BENCH_BINARY : /users/lneuwirt/data/lneuwirt/where-simd-helps/harness/build-hpc/bench_mlkem1024_ref
|
||||
## BENCH_DATE : 2026-04-02T12:18:20-04:00
|
||||
## CPU_MODEL : Intel(R) Xeon(R) Platinum 8268 CPU @ 2.90GHz
|
||||
## PERF_PARANOID : 2
|
||||
## PAPI_BUILD : OFF
|
||||
ERROR: binary not found or not executable:
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,19 @@
|
|||
## SLURM PROLOG ###############################################################
|
||||
## Job ID : 1179890
|
||||
## Job Name : bench_mlkem512_avx2
|
||||
## Nodelist : node2333
|
||||
## CPUs : 1
|
||||
## Mem/Node : 256 MB
|
||||
## Directory : /oscar/data/lshu/lneuwirt/where-simd-helps/slurm
|
||||
## Job Started : Thu Apr 2 12:18:20 PM EDT 2026
|
||||
###############################################################################
|
||||
pid 2240632's current affinity list: 40
|
||||
## BENCH_VARIANT : avx2
|
||||
## BENCH_PARAM : 512
|
||||
## BENCH_NSPINS : 1000
|
||||
## BENCH_BINARY : /users/lneuwirt/data/lneuwirt/where-simd-helps/harness/build-hpc/bench_mlkem512_avx2
|
||||
## BENCH_DATE : 2026-04-02T12:18:20-04:00
|
||||
## CPU_MODEL : Intel(R) Xeon(R) Platinum 8268 CPU @ 2.90GHz
|
||||
## PERF_PARANOID : 2
|
||||
## PAPI_BUILD : OFF
|
||||
ERROR: binary not found or not executable:
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,19 @@
|
|||
## SLURM PROLOG ###############################################################
|
||||
## Job ID : 1179889
|
||||
## Job Name : bench_mlkem512_ref
|
||||
## Nodelist : node2333
|
||||
## CPUs : 1
|
||||
## Mem/Node : 256 MB
|
||||
## Directory : /oscar/data/lshu/lneuwirt/where-simd-helps/slurm
|
||||
## Job Started : Thu Apr 2 12:18:20 PM EDT 2026
|
||||
###############################################################################
|
||||
pid 2240630's current affinity list: 39
|
||||
## BENCH_VARIANT : ref
|
||||
## BENCH_PARAM : 512
|
||||
## BENCH_NSPINS : 1000
|
||||
## BENCH_BINARY : /users/lneuwirt/data/lneuwirt/where-simd-helps/harness/build-hpc/bench_mlkem512_ref
|
||||
## BENCH_DATE : 2026-04-02T12:18:20-04:00
|
||||
## CPU_MODEL : Intel(R) Xeon(R) Platinum 8268 CPU @ 2.90GHz
|
||||
## PERF_PARANOID : 2
|
||||
## PAPI_BUILD : OFF
|
||||
ERROR: binary not found or not executable:
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,19 @@
|
|||
## SLURM PROLOG ###############################################################
|
||||
## Job ID : 1179892
|
||||
## Job Name : bench_mlkem768_avx2
|
||||
## Nodelist : node2334
|
||||
## CPUs : 1
|
||||
## Mem/Node : 256 MB
|
||||
## Directory : /oscar/data/lshu/lneuwirt/where-simd-helps/slurm
|
||||
## Job Started : Thu Apr 2 12:18:20 PM EDT 2026
|
||||
###############################################################################
|
||||
pid 1627592's current affinity list: 32
|
||||
## BENCH_VARIANT : avx2
|
||||
## BENCH_PARAM : 768
|
||||
## BENCH_NSPINS : 1000
|
||||
## BENCH_BINARY : /users/lneuwirt/data/lneuwirt/where-simd-helps/harness/build-hpc/bench_mlkem768_avx2
|
||||
## BENCH_DATE : 2026-04-02T12:18:20-04:00
|
||||
## CPU_MODEL : Intel(R) Xeon(R) Platinum 8268 CPU @ 2.90GHz
|
||||
## PERF_PARANOID : 2
|
||||
## PAPI_BUILD : OFF
|
||||
ERROR: binary not found or not executable:
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,19 @@
|
|||
## SLURM PROLOG ###############################################################
|
||||
## Job ID : 1179891
|
||||
## Job Name : bench_mlkem768_ref
|
||||
## Nodelist : node2333
|
||||
## CPUs : 1
|
||||
## Mem/Node : 256 MB
|
||||
## Directory : /oscar/data/lshu/lneuwirt/where-simd-helps/slurm
|
||||
## Job Started : Thu Apr 2 12:18:20 PM EDT 2026
|
||||
###############################################################################
|
||||
pid 2240631's current affinity list: 42
|
||||
## BENCH_VARIANT : ref
|
||||
## BENCH_PARAM : 768
|
||||
## BENCH_NSPINS : 1000
|
||||
## BENCH_BINARY : /users/lneuwirt/data/lneuwirt/where-simd-helps/harness/build-hpc/bench_mlkem768_ref
|
||||
## BENCH_DATE : 2026-04-02T12:18:20-04:00
|
||||
## CPU_MODEL : Intel(R) Xeon(R) Platinum 8268 CPU @ 2.90GHz
|
||||
## PERF_PARANOID : 2
|
||||
## PAPI_BUILD : OFF
|
||||
ERROR: binary not found or not executable:
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -1,57 +1,157 @@
|
|||
cmake_minimum_required(VERSION 3.20)
|
||||
project(pqc-bench C)
|
||||
project(pqc-bench C ASM)
|
||||
|
||||
set(CMAKE_C_STANDARD 11)
|
||||
|
||||
# ── Compiler flags ──────────────────────────────────────────────────────────
|
||||
# Release build with full optimization; override on the command line:
|
||||
# cmake -DCMAKE_BUILD_TYPE=Debug ..
|
||||
if(NOT CMAKE_BUILD_TYPE)
|
||||
set(CMAKE_BUILD_TYPE Release)
|
||||
endif()
|
||||
|
||||
set(CMAKE_C_FLAGS_RELEASE "-O3 -march=native")
|
||||
set(KYBER_ROOT ${CMAKE_SOURCE_DIR}/../algorithms/kyber)
|
||||
|
||||
# ── Algorithm root (submodule) ───────────────────────────────────────────────
|
||||
# Each target below compiles a variant of test_speed.c against a specific
|
||||
# algorithm build. Add algorithm libraries as submodule CMake subdirectories
|
||||
# or via add_library() here as the project grows.
|
||||
#
|
||||
# Example layout once kyber submodule is added:
|
||||
# algorithms/kyber/ref/ → static lib kyber512_ref, kyber768_ref, kyber1024_ref
|
||||
# algorithms/kyber/avx2/ → static lib kyber512_avx2, ...
|
||||
# ── Helpers shared across variants ──────────────────────────────────────────
|
||||
# cpucycles / speed_print live in the kyber ref test dir; both variants use
|
||||
# the same copies (avx2/test/ has identical files).
|
||||
set(BENCH_HELPERS
|
||||
${KYBER_ROOT}/ref/test/cpucycles.c
|
||||
${KYBER_ROOT}/ref/test/speed_print.c
|
||||
)
|
||||
|
||||
# ── Harness source ───────────────────────────────────────────────────────────
|
||||
set(HARNESS_SRC src/test_speed.c)
|
||||
set(HARNESS_SRC ${CMAKE_SOURCE_DIR}/src/test_speed.c)
|
||||
|
||||
# ── Build variants ───────────────────────────────────────────────────────────
|
||||
# Uncomment and adjust as algorithm libraries become available.
|
||||
#
|
||||
# foreach(PARAM 512 768 1024)
|
||||
# foreach(VARIANT ref refnv)
|
||||
# set(TARGET "bench_mlkem${PARAM}_${VARIANT}")
|
||||
# add_executable(${TARGET} ${HARNESS_SRC})
|
||||
# target_include_directories(${TARGET} PRIVATE
|
||||
# ${CMAKE_SOURCE_DIR}/../algorithms/kyber/${VARIANT})
|
||||
# target_link_libraries(${TARGET} kyber${PARAM}_${VARIANT})
|
||||
# target_compile_definitions(${TARGET} PRIVATE KYBER_K=${PARAM})
|
||||
# endforeach()
|
||||
# endforeach()
|
||||
# ── ref sources (pure C, portable) ──────────────────────────────────────────
|
||||
set(REF_DIR ${KYBER_ROOT}/ref)
|
||||
set(REF_SOURCES
|
||||
${REF_DIR}/kem.c
|
||||
${REF_DIR}/indcpa.c
|
||||
${REF_DIR}/polyvec.c
|
||||
${REF_DIR}/poly.c
|
||||
${REF_DIR}/ntt.c
|
||||
${REF_DIR}/cbd.c
|
||||
${REF_DIR}/reduce.c
|
||||
${REF_DIR}/verify.c
|
||||
${REF_DIR}/fips202.c
|
||||
${REF_DIR}/symmetric-shake.c
|
||||
${REF_DIR}/randombytes.c
|
||||
)
|
||||
|
||||
# ── avx2 sources (C + x86 assembly) ─────────────────────────────────────────
|
||||
set(AVX2_DIR ${KYBER_ROOT}/avx2)
|
||||
set(AVX2_SOURCES
|
||||
${AVX2_DIR}/kem.c
|
||||
${AVX2_DIR}/indcpa.c
|
||||
${AVX2_DIR}/polyvec.c
|
||||
${AVX2_DIR}/poly.c
|
||||
${AVX2_DIR}/cbd.c
|
||||
${AVX2_DIR}/verify.c
|
||||
${AVX2_DIR}/fips202.c
|
||||
${AVX2_DIR}/fips202x4.c
|
||||
${AVX2_DIR}/symmetric-shake.c
|
||||
${AVX2_DIR}/randombytes.c
|
||||
${AVX2_DIR}/consts.c
|
||||
${AVX2_DIR}/rejsample.c
|
||||
${AVX2_DIR}/fq.S
|
||||
${AVX2_DIR}/shuffle.S
|
||||
${AVX2_DIR}/ntt.S
|
||||
${AVX2_DIR}/invntt.S
|
||||
${AVX2_DIR}/basemul.S
|
||||
${AVX2_DIR}/keccak4x/KeccakP-1600-times4-SIMD256.c
|
||||
)
|
||||
|
||||
# ── KYBER_K mapping ──────────────────────────────────────────────────────────
|
||||
# 512 → K=2, 768 → K=3, 1024 → K=4
|
||||
set(KYBER_K_512 2)
|
||||
set(KYBER_K_768 3)
|
||||
set(KYBER_K_1024 4)
|
||||
|
||||
# ── Build targets ────────────────────────────────────────────────────────────
|
||||
foreach(LEVEL 512 768 1024)
|
||||
set(K ${KYBER_K_${LEVEL}})
|
||||
|
||||
# ref — optimised reference (O3, auto-vectorisation enabled)
|
||||
set(REF_TARGET bench_mlkem${LEVEL}_ref)
|
||||
add_executable(${REF_TARGET}
|
||||
${HARNESS_SRC}
|
||||
${REF_SOURCES}
|
||||
${BENCH_HELPERS}
|
||||
)
|
||||
target_include_directories(${REF_TARGET} PRIVATE
|
||||
${REF_DIR}
|
||||
${REF_DIR}/test
|
||||
)
|
||||
target_compile_definitions(${REF_TARGET} PRIVATE KYBER_K=${K})
|
||||
target_compile_options(${REF_TARGET} PRIVATE -O3 -fomit-frame-pointer)
|
||||
|
||||
# refnv — ref with auto-vectorisation disabled; isolates scalar O3 performance
|
||||
set(REFNV_TARGET bench_mlkem${LEVEL}_refnv)
|
||||
add_executable(${REFNV_TARGET}
|
||||
${HARNESS_SRC}
|
||||
${REF_SOURCES}
|
||||
${BENCH_HELPERS}
|
||||
)
|
||||
target_include_directories(${REFNV_TARGET} PRIVATE
|
||||
${REF_DIR}
|
||||
${REF_DIR}/test
|
||||
)
|
||||
target_compile_definitions(${REFNV_TARGET} PRIVATE KYBER_K=${K})
|
||||
target_compile_options(${REFNV_TARGET} PRIVATE
|
||||
-O3 -fomit-frame-pointer -fno-tree-vectorize
|
||||
)
|
||||
|
||||
# refo0 — ref at -O0; establishes unoptimised baseline
|
||||
set(REFO0_TARGET bench_mlkem${LEVEL}_refo0)
|
||||
add_executable(${REFO0_TARGET}
|
||||
${HARNESS_SRC}
|
||||
${REF_SOURCES}
|
||||
${BENCH_HELPERS}
|
||||
)
|
||||
target_include_directories(${REFO0_TARGET} PRIVATE
|
||||
${REF_DIR}
|
||||
${REF_DIR}/test
|
||||
)
|
||||
target_compile_definitions(${REFO0_TARGET} PRIVATE KYBER_K=${K})
|
||||
target_compile_options(${REFO0_TARGET} PRIVATE -O0)
|
||||
|
||||
# avx2 — hand-written AVX2 assembly + O3
|
||||
set(AVX2_TARGET bench_mlkem${LEVEL}_avx2)
|
||||
add_executable(${AVX2_TARGET}
|
||||
${HARNESS_SRC}
|
||||
${AVX2_SOURCES}
|
||||
${BENCH_HELPERS}
|
||||
)
|
||||
target_include_directories(${AVX2_TARGET} PRIVATE
|
||||
${AVX2_DIR}
|
||||
${AVX2_DIR}/test
|
||||
${AVX2_DIR}/keccak4x
|
||||
)
|
||||
target_compile_definitions(${AVX2_TARGET} PRIVATE KYBER_K=${K})
|
||||
target_compile_options(${AVX2_TARGET} PRIVATE
|
||||
-O3 -fomit-frame-pointer -mavx2 -mbmi2 -mpopcnt -march=native -mtune=native
|
||||
)
|
||||
endforeach()
|
||||
|
||||
# ── PAPI (hardware performance counters) ─────────────────────────────────────
|
||||
# Optional; enable with -DWITH_PAPI=ON
|
||||
option(WITH_PAPI "Link against PAPI for hardware counter collection" OFF)
|
||||
if(WITH_PAPI)
|
||||
find_library(PAPI_LIB papi REQUIRED)
|
||||
find_path(PAPI_INCLUDE papi.h REQUIRED)
|
||||
# Targets that need PAPI:
|
||||
# target_include_directories(<target> PRIVATE ${PAPI_INCLUDE})
|
||||
# target_link_libraries(<target> ${PAPI_LIB})
|
||||
foreach(LEVEL 512 768 1024)
|
||||
foreach(VARIANT ref refnv refo0 avx2)
|
||||
set(T bench_mlkem${LEVEL}_${VARIANT})
|
||||
target_include_directories(${T} PRIVATE ${PAPI_INCLUDE})
|
||||
target_link_libraries(${T} ${PAPI_LIB})
|
||||
target_compile_definitions(${T} PRIVATE WITH_PAPI)
|
||||
endforeach()
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
# ── RAPL energy measurement ──────────────────────────────────────────────────
|
||||
# Optional; enable with -DWITH_RAPL=ON (requires root or CAP_SYS_RAWIO)
|
||||
# Requires root or CAP_SYS_RAWIO on the benchmark node.
|
||||
option(WITH_RAPL "Enable Intel RAPL energy measurement" OFF)
|
||||
if(WITH_RAPL)
|
||||
# target_compile_definitions(<target> PRIVATE WITH_RAPL)
|
||||
foreach(LEVEL 512 768 1024)
|
||||
foreach(VARIANT ref refnv refo0 avx2)
|
||||
target_compile_definitions(bench_mlkem${LEVEL}_${VARIANT} PRIVATE WITH_RAPL)
|
||||
endforeach()
|
||||
endforeach()
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -0,0 +1,394 @@
|
|||
# This is the CMakeCache file.
|
||||
# For build in directory: /home/jeans/Repos/research/pqc/where-simd-helps/harness/build-papi
|
||||
# It was generated by CMake: /usr/bin/cmake
|
||||
# You can edit this file to change values found and used by cmake.
|
||||
# If you do not want to change any of the values, simply exit the editor.
|
||||
# If you do want to change a value, simply edit, save, and exit the editor.
|
||||
# The syntax for the file is as follows:
|
||||
# KEY:TYPE=VALUE
|
||||
# KEY is the name of a variable in the cache.
|
||||
# TYPE is a hint to GUIs for the type of VALUE, DO NOT EDIT TYPE!.
|
||||
# VALUE is the current value for the KEY.
|
||||
|
||||
########################
|
||||
# EXTERNAL cache entries
|
||||
########################
|
||||
|
||||
//Path to a program.
|
||||
CMAKE_ADDR2LINE:FILEPATH=/usr/bin/addr2line
|
||||
|
||||
//Path to a program.
|
||||
CMAKE_AR:FILEPATH=/usr/bin/ar
|
||||
|
||||
//ASM compiler
|
||||
CMAKE_ASM_COMPILER:FILEPATH=/usr/bin/cc
|
||||
|
||||
//A wrapper around 'ar' adding the appropriate '--plugin' option
|
||||
// for the GCC compiler
|
||||
CMAKE_ASM_COMPILER_AR:FILEPATH=/usr/bin/gcc-ar
|
||||
|
||||
//A wrapper around 'ranlib' adding the appropriate '--plugin' option
|
||||
// for the GCC compiler
|
||||
CMAKE_ASM_COMPILER_RANLIB:FILEPATH=/usr/bin/gcc-ranlib
|
||||
|
||||
//Flags used by the ASM compiler during all build types.
|
||||
CMAKE_ASM_FLAGS:STRING=
|
||||
|
||||
//Flags used by the ASM compiler during DEBUG builds.
|
||||
CMAKE_ASM_FLAGS_DEBUG:STRING=-g
|
||||
|
||||
//Flags used by the ASM compiler during MINSIZEREL builds.
|
||||
CMAKE_ASM_FLAGS_MINSIZEREL:STRING=-Os -DNDEBUG
|
||||
|
||||
//Flags used by the ASM compiler during RELEASE builds.
|
||||
CMAKE_ASM_FLAGS_RELEASE:STRING=-O3 -DNDEBUG
|
||||
|
||||
//Flags used by the ASM compiler during RELWITHDEBINFO builds.
|
||||
CMAKE_ASM_FLAGS_RELWITHDEBINFO:STRING=-O2 -g -DNDEBUG
|
||||
|
||||
//Choose the type of build, options are: None Debug Release RelWithDebInfo
|
||||
// MinSizeRel ...
|
||||
CMAKE_BUILD_TYPE:STRING=Release
|
||||
|
||||
//Enable/Disable color output during build.
|
||||
CMAKE_COLOR_MAKEFILE:BOOL=ON
|
||||
|
||||
//C compiler
|
||||
CMAKE_C_COMPILER:FILEPATH=/usr/bin/cc
|
||||
|
||||
//A wrapper around 'ar' adding the appropriate '--plugin' option
|
||||
// for the GCC compiler
|
||||
CMAKE_C_COMPILER_AR:FILEPATH=/usr/bin/gcc-ar
|
||||
|
||||
//A wrapper around 'ranlib' adding the appropriate '--plugin' option
|
||||
// for the GCC compiler
|
||||
CMAKE_C_COMPILER_RANLIB:FILEPATH=/usr/bin/gcc-ranlib
|
||||
|
||||
//Flags used by the C compiler during all build types.
|
||||
CMAKE_C_FLAGS:STRING=
|
||||
|
||||
//Flags used by the C compiler during DEBUG builds.
|
||||
CMAKE_C_FLAGS_DEBUG:STRING=-g
|
||||
|
||||
//Flags used by the C compiler during MINSIZEREL builds.
|
||||
CMAKE_C_FLAGS_MINSIZEREL:STRING=-Os -DNDEBUG
|
||||
|
||||
//Flags used by the C compiler during RELEASE builds.
|
||||
CMAKE_C_FLAGS_RELEASE:STRING=-O3 -DNDEBUG
|
||||
|
||||
//Flags used by the C compiler during RELWITHDEBINFO builds.
|
||||
CMAKE_C_FLAGS_RELWITHDEBINFO:STRING=-O2 -g -DNDEBUG
|
||||
|
||||
//Path to a program.
|
||||
CMAKE_DLLTOOL:FILEPATH=CMAKE_DLLTOOL-NOTFOUND
|
||||
|
||||
//Flags used by the linker during all build types.
|
||||
CMAKE_EXE_LINKER_FLAGS:STRING=
|
||||
|
||||
//Flags used by the linker during DEBUG builds.
|
||||
CMAKE_EXE_LINKER_FLAGS_DEBUG:STRING=
|
||||
|
||||
//Flags used by the linker during MINSIZEREL builds.
|
||||
CMAKE_EXE_LINKER_FLAGS_MINSIZEREL:STRING=
|
||||
|
||||
//Flags used by the linker during RELEASE builds.
|
||||
CMAKE_EXE_LINKER_FLAGS_RELEASE:STRING=
|
||||
|
||||
//Flags used by the linker during RELWITHDEBINFO builds.
|
||||
CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
|
||||
|
||||
//Enable/Disable output of compile commands during generation.
|
||||
CMAKE_EXPORT_COMPILE_COMMANDS:BOOL=
|
||||
|
||||
//Value Computed by CMake.
|
||||
CMAKE_FIND_PACKAGE_REDIRECTS_DIR:STATIC=/home/jeans/Repos/research/pqc/where-simd-helps/harness/build-papi/CMakeFiles/pkgRedirects
|
||||
|
||||
//Install path prefix, prepended onto install directories.
|
||||
CMAKE_INSTALL_PREFIX:PATH=/usr/local
|
||||
|
||||
//Path to a program.
|
||||
CMAKE_LINKER:FILEPATH=/usr/bin/ld
|
||||
|
||||
//Path to a program.
|
||||
CMAKE_MAKE_PROGRAM:FILEPATH=/usr/bin/make
|
||||
|
||||
//Flags used by the linker during the creation of modules during
|
||||
// all build types.
|
||||
CMAKE_MODULE_LINKER_FLAGS:STRING=
|
||||
|
||||
//Flags used by the linker during the creation of modules during
|
||||
// DEBUG builds.
|
||||
CMAKE_MODULE_LINKER_FLAGS_DEBUG:STRING=
|
||||
|
||||
//Flags used by the linker during the creation of modules during
|
||||
// MINSIZEREL builds.
|
||||
CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL:STRING=
|
||||
|
||||
//Flags used by the linker during the creation of modules during
|
||||
// RELEASE builds.
|
||||
CMAKE_MODULE_LINKER_FLAGS_RELEASE:STRING=
|
||||
|
||||
//Flags used by the linker during the creation of modules during
|
||||
// RELWITHDEBINFO builds.
|
||||
CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
|
||||
|
||||
//Path to a program.
|
||||
CMAKE_NM:FILEPATH=/usr/bin/nm
|
||||
|
||||
//Path to a program.
|
||||
CMAKE_OBJCOPY:FILEPATH=/usr/bin/objcopy
|
||||
|
||||
//Path to a program.
|
||||
CMAKE_OBJDUMP:FILEPATH=/usr/bin/objdump
|
||||
|
||||
//Value Computed by CMake
|
||||
CMAKE_PROJECT_COMPAT_VERSION:STATIC=
|
||||
|
||||
//Value Computed by CMake
|
||||
CMAKE_PROJECT_DESCRIPTION:STATIC=
|
||||
|
||||
//Value Computed by CMake
|
||||
CMAKE_PROJECT_HOMEPAGE_URL:STATIC=
|
||||
|
||||
//Value Computed by CMake
|
||||
CMAKE_PROJECT_NAME:STATIC=pqc-bench
|
||||
|
||||
//Value Computed by CMake
|
||||
CMAKE_PROJECT_SPDX_LICENSE:STATIC=
|
||||
|
||||
//Path to a program.
|
||||
CMAKE_RANLIB:FILEPATH=/usr/bin/ranlib
|
||||
|
||||
//Path to a program.
|
||||
CMAKE_READELF:FILEPATH=/usr/bin/readelf
|
||||
|
||||
//Flags used by the linker during the creation of shared libraries
|
||||
// during all build types.
|
||||
CMAKE_SHARED_LINKER_FLAGS:STRING=
|
||||
|
||||
//Flags used by the linker during the creation of shared libraries
|
||||
// during DEBUG builds.
|
||||
CMAKE_SHARED_LINKER_FLAGS_DEBUG:STRING=
|
||||
|
||||
//Flags used by the linker during the creation of shared libraries
|
||||
// during MINSIZEREL builds.
|
||||
CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL:STRING=
|
||||
|
||||
//Flags used by the linker during the creation of shared libraries
|
||||
// during RELEASE builds.
|
||||
CMAKE_SHARED_LINKER_FLAGS_RELEASE:STRING=
|
||||
|
||||
//Flags used by the linker during the creation of shared libraries
|
||||
// during RELWITHDEBINFO builds.
|
||||
CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO:STRING=
|
||||
|
||||
//If set, runtime paths are not added when installing shared libraries,
|
||||
// but are added when building.
|
||||
CMAKE_SKIP_INSTALL_RPATH:BOOL=NO
|
||||
|
||||
//If set, runtime paths are not added when using shared libraries.
|
||||
CMAKE_SKIP_RPATH:BOOL=NO
|
||||
|
||||
//Flags used by the archiver during the creation of static libraries
|
||||
// during all build types.
|
||||
CMAKE_STATIC_LINKER_FLAGS:STRING=
|
||||
|
||||
//Flags used by the archiver during the creation of static libraries
|
||||
// during DEBUG builds.
|
||||
CMAKE_STATIC_LINKER_FLAGS_DEBUG:STRING=
|
||||
|
||||
//Flags used by the archiver during the creation of static libraries
|
||||
// during MINSIZEREL builds.
|
||||
CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL:STRING=
|
||||
|
||||
//Flags used by the archiver during the creation of static libraries
|
||||
// during RELEASE builds.
|
||||
CMAKE_STATIC_LINKER_FLAGS_RELEASE:STRING=
|
||||
|
||||
//Flags used by the archiver during the creation of static libraries
|
||||
// during RELWITHDEBINFO builds.
|
||||
CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO:STRING=
|
||||
|
||||
//Path to a program.
|
||||
CMAKE_STRIP:FILEPATH=/usr/bin/strip
|
||||
|
||||
//Path to a program.
|
||||
CMAKE_TAPI:FILEPATH=CMAKE_TAPI-NOTFOUND
|
||||
|
||||
//If this value is on, makefiles will be generated without the
|
||||
// .SILENT directive, and all commands will be echoed to the console
|
||||
// during the make. This is useful for debugging only. With Visual
|
||||
// Studio IDE projects all commands are done without /nologo.
|
||||
CMAKE_VERBOSE_MAKEFILE:BOOL=FALSE
|
||||
|
||||
//Path to a library.
|
||||
PAPI_LIB:FILEPATH=PAPI_LIB-NOTFOUND
|
||||
|
||||
//Link against PAPI for hardware counter collection
|
||||
WITH_PAPI:BOOL=ON
|
||||
|
||||
//Value Computed by CMake
|
||||
pqc-bench_BINARY_DIR:STATIC=/home/jeans/Repos/research/pqc/where-simd-helps/harness/build-papi
|
||||
|
||||
//Value Computed by CMake
|
||||
pqc-bench_IS_TOP_LEVEL:STATIC=ON
|
||||
|
||||
//Value Computed by CMake
|
||||
pqc-bench_SOURCE_DIR:STATIC=/home/jeans/Repos/research/pqc/where-simd-helps/harness
|
||||
|
||||
|
||||
########################
|
||||
# INTERNAL cache entries
|
||||
########################
|
||||
|
||||
//ADVANCED property for variable: CMAKE_ADDR2LINE
|
||||
CMAKE_ADDR2LINE-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_AR
|
||||
CMAKE_AR-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_ASM_COMPILER
|
||||
CMAKE_ASM_COMPILER-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_ASM_COMPILER_AR
|
||||
CMAKE_ASM_COMPILER_AR-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_ASM_COMPILER_RANLIB
|
||||
CMAKE_ASM_COMPILER_RANLIB-ADVANCED:INTERNAL=1
|
||||
CMAKE_ASM_COMPILER_WORKS:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_ASM_FLAGS
|
||||
CMAKE_ASM_FLAGS-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_ASM_FLAGS_DEBUG
|
||||
CMAKE_ASM_FLAGS_DEBUG-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_ASM_FLAGS_MINSIZEREL
|
||||
CMAKE_ASM_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_ASM_FLAGS_RELEASE
|
||||
CMAKE_ASM_FLAGS_RELEASE-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_ASM_FLAGS_RELWITHDEBINFO
|
||||
CMAKE_ASM_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
|
||||
//This is the directory where this CMakeCache.txt was created
|
||||
CMAKE_CACHEFILE_DIR:INTERNAL=/home/jeans/Repos/research/pqc/where-simd-helps/harness/build-papi
|
||||
//Major version of cmake used to create the current loaded cache
|
||||
CMAKE_CACHE_MAJOR_VERSION:INTERNAL=4
|
||||
//Minor version of cmake used to create the current loaded cache
|
||||
CMAKE_CACHE_MINOR_VERSION:INTERNAL=3
|
||||
//Patch version of cmake used to create the current loaded cache
|
||||
CMAKE_CACHE_PATCH_VERSION:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_COLOR_MAKEFILE
|
||||
CMAKE_COLOR_MAKEFILE-ADVANCED:INTERNAL=1
|
||||
//Path to CMake executable.
|
||||
CMAKE_COMMAND:INTERNAL=/usr/bin/cmake
|
||||
//Path to cpack program executable.
|
||||
CMAKE_CPACK_COMMAND:INTERNAL=/usr/bin/cpack
|
||||
//Path to ctest program executable.
|
||||
CMAKE_CTEST_COMMAND:INTERNAL=/usr/bin/ctest
|
||||
//ADVANCED property for variable: CMAKE_C_COMPILER
|
||||
CMAKE_C_COMPILER-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_C_COMPILER_AR
|
||||
CMAKE_C_COMPILER_AR-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_C_COMPILER_RANLIB
|
||||
CMAKE_C_COMPILER_RANLIB-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_C_FLAGS
|
||||
CMAKE_C_FLAGS-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_C_FLAGS_DEBUG
|
||||
CMAKE_C_FLAGS_DEBUG-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_C_FLAGS_MINSIZEREL
|
||||
CMAKE_C_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_C_FLAGS_RELEASE
|
||||
CMAKE_C_FLAGS_RELEASE-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_C_FLAGS_RELWITHDEBINFO
|
||||
CMAKE_C_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_DLLTOOL
|
||||
CMAKE_DLLTOOL-ADVANCED:INTERNAL=1
|
||||
//Path to cache edit program executable.
|
||||
CMAKE_EDIT_COMMAND:INTERNAL=/usr/bin/ccmake
|
||||
//Executable file format
|
||||
CMAKE_EXECUTABLE_FORMAT:INTERNAL=ELF
|
||||
//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS
|
||||
CMAKE_EXE_LINKER_FLAGS-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_DEBUG
|
||||
CMAKE_EXE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_MINSIZEREL
|
||||
CMAKE_EXE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELEASE
|
||||
CMAKE_EXE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO
|
||||
CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_EXPORT_COMPILE_COMMANDS
|
||||
CMAKE_EXPORT_COMPILE_COMMANDS-ADVANCED:INTERNAL=1
|
||||
//Name of external makefile project generator.
|
||||
CMAKE_EXTRA_GENERATOR:INTERNAL=
|
||||
//Name of generator.
|
||||
CMAKE_GENERATOR:INTERNAL=Unix Makefiles
|
||||
//Generator instance identifier.
|
||||
CMAKE_GENERATOR_INSTANCE:INTERNAL=
|
||||
//Name of generator platform.
|
||||
CMAKE_GENERATOR_PLATFORM:INTERNAL=
|
||||
//Name of generator toolset.
|
||||
CMAKE_GENERATOR_TOOLSET:INTERNAL=
|
||||
//Source directory with the top level CMakeLists.txt file for this
|
||||
// project
|
||||
CMAKE_HOME_DIRECTORY:INTERNAL=/home/jeans/Repos/research/pqc/where-simd-helps/harness
|
||||
//Install .so files without execute permission.
|
||||
CMAKE_INSTALL_SO_NO_EXE:INTERNAL=0
|
||||
//ADVANCED property for variable: CMAKE_LINKER
|
||||
CMAKE_LINKER-ADVANCED:INTERNAL=1
|
||||
//Name of CMakeLists files to read
|
||||
CMAKE_LIST_FILE_NAME:INTERNAL=CMakeLists.txt
|
||||
//ADVANCED property for variable: CMAKE_MAKE_PROGRAM
|
||||
CMAKE_MAKE_PROGRAM-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS
|
||||
CMAKE_MODULE_LINKER_FLAGS-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_DEBUG
|
||||
CMAKE_MODULE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL
|
||||
CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELEASE
|
||||
CMAKE_MODULE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO
|
||||
CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_NM
|
||||
CMAKE_NM-ADVANCED:INTERNAL=1
|
||||
//number of local generators
|
||||
CMAKE_NUMBER_OF_MAKEFILES:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_OBJCOPY
|
||||
CMAKE_OBJCOPY-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_OBJDUMP
|
||||
CMAKE_OBJDUMP-ADVANCED:INTERNAL=1
|
||||
//Platform information initialized
|
||||
CMAKE_PLATFORM_INFO_INITIALIZED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_RANLIB
|
||||
CMAKE_RANLIB-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_READELF
|
||||
CMAKE_READELF-ADVANCED:INTERNAL=1
|
||||
//Path to CMake installation.
|
||||
CMAKE_ROOT:INTERNAL=/usr/share/cmake
|
||||
//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS
|
||||
CMAKE_SHARED_LINKER_FLAGS-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_DEBUG
|
||||
CMAKE_SHARED_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL
|
||||
CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELEASE
|
||||
CMAKE_SHARED_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO
|
||||
CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_SKIP_INSTALL_RPATH
|
||||
CMAKE_SKIP_INSTALL_RPATH-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_SKIP_RPATH
|
||||
CMAKE_SKIP_RPATH-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS
|
||||
CMAKE_STATIC_LINKER_FLAGS-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_DEBUG
|
||||
CMAKE_STATIC_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL
|
||||
CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELEASE
|
||||
CMAKE_STATIC_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO
|
||||
CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_STRIP
|
||||
CMAKE_STRIP-ADVANCED:INTERNAL=1
|
||||
//ADVANCED property for variable: CMAKE_TAPI
|
||||
CMAKE_TAPI-ADVANCED:INTERNAL=1
|
||||
//uname command
|
||||
CMAKE_UNAME:INTERNAL=/usr/bin/uname
|
||||
//ADVANCED property for variable: CMAKE_VERBOSE_MAKEFILE
|
||||
CMAKE_VERBOSE_MAKEFILE-ADVANCED:INTERNAL=1
|
||||
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
set(CMAKE_ASM_COMPILER "/usr/bin/cc")
|
||||
set(CMAKE_ASM_COMPILER_ARG1 "")
|
||||
set(CMAKE_AR "/usr/bin/ar")
|
||||
set(CMAKE_ASM_COMPILER_AR "/usr/bin/gcc-ar")
|
||||
set(CMAKE_RANLIB "/usr/bin/ranlib")
|
||||
set(CMAKE_ASM_COMPILER_RANLIB "/usr/bin/gcc-ranlib")
|
||||
set(CMAKE_LINKER "/usr/bin/ld")
|
||||
set(CMAKE_LINKER_LINK "")
|
||||
set(CMAKE_LINKER_LLD "")
|
||||
set(CMAKE_ASM_COMPILER_LINKER "")
|
||||
set(CMAKE_ASM_COMPILER_LINKER_ID "")
|
||||
set(CMAKE_ASM_COMPILER_LINKER_VERSION )
|
||||
set(CMAKE_ASM_COMPILER_LINKER_FRONTEND_VARIANT )
|
||||
set(CMAKE_MT "")
|
||||
set(CMAKE_TAPI "CMAKE_TAPI-NOTFOUND")
|
||||
set(CMAKE_ASM_COMPILER_LOADED 1)
|
||||
set(CMAKE_ASM_COMPILER_ID "GNU")
|
||||
set(CMAKE_ASM_COMPILER_VERSION "")
|
||||
set(CMAKE_ASM_COMPILER_ENV_VAR "ASM")
|
||||
|
||||
set(CMAKE_ASM_COMPILER_ARCHITECTURE_ID "")
|
||||
|
||||
|
||||
set(CMAKE_ASM_IGNORE_EXTENSIONS h;H;o;O;obj;OBJ;def;DEF;rc;RC)
|
||||
set(CMAKE_ASM_LINKER_PREFERENCE 0)
|
||||
set(CMAKE_ASM_LINKER_DEPFILE_SUPPORTED )
|
||||
set(CMAKE_LINKER_PUSHPOP_STATE_SUPPORTED TRUE)
|
||||
set(CMAKE_ASM_LINKER_PUSHPOP_STATE_SUPPORTED )
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
set(CMAKE_C_COMPILER "/usr/bin/cc")
|
||||
set(CMAKE_C_COMPILER_ARG1 "")
|
||||
set(CMAKE_C_COMPILER_ID "GNU")
|
||||
set(CMAKE_C_COMPILER_VERSION "15.2.1")
|
||||
set(CMAKE_C_COMPILER_VERSION_INTERNAL "")
|
||||
set(CMAKE_C_COMPILER_WRAPPER "")
|
||||
set(CMAKE_C_STANDARD_COMPUTED_DEFAULT "23")
|
||||
set(CMAKE_C_EXTENSIONS_COMPUTED_DEFAULT "ON")
|
||||
set(CMAKE_C_STANDARD_LATEST "23")
|
||||
set(CMAKE_C_COMPILE_FEATURES "c_std_90;c_function_prototypes;c_std_99;c_restrict;c_variadic_macros;c_std_11;c_static_assert;c_std_17;c_std_23")
|
||||
set(CMAKE_C90_COMPILE_FEATURES "c_std_90;c_function_prototypes")
|
||||
set(CMAKE_C99_COMPILE_FEATURES "c_std_99;c_restrict;c_variadic_macros")
|
||||
set(CMAKE_C11_COMPILE_FEATURES "c_std_11;c_static_assert")
|
||||
set(CMAKE_C17_COMPILE_FEATURES "c_std_17")
|
||||
set(CMAKE_C23_COMPILE_FEATURES "c_std_23")
|
||||
|
||||
set(CMAKE_C_PLATFORM_ID "Linux")
|
||||
set(CMAKE_C_SIMULATE_ID "")
|
||||
set(CMAKE_C_COMPILER_FRONTEND_VARIANT "GNU")
|
||||
set(CMAKE_C_COMPILER_APPLE_SYSROOT "")
|
||||
set(CMAKE_C_SIMULATE_VERSION "")
|
||||
set(CMAKE_C_COMPILER_ARCHITECTURE_ID "x86_64")
|
||||
|
||||
|
||||
|
||||
|
||||
set(CMAKE_AR "/usr/bin/ar")
|
||||
set(CMAKE_C_COMPILER_AR "/usr/bin/gcc-ar")
|
||||
set(CMAKE_RANLIB "/usr/bin/ranlib")
|
||||
set(CMAKE_C_COMPILER_RANLIB "/usr/bin/gcc-ranlib")
|
||||
set(CMAKE_LINKER "/usr/bin/ld")
|
||||
set(CMAKE_LINKER_LINK "")
|
||||
set(CMAKE_LINKER_LLD "")
|
||||
set(CMAKE_C_COMPILER_LINKER "/usr/bin/ld")
|
||||
set(CMAKE_C_COMPILER_LINKER_ID "GNU")
|
||||
set(CMAKE_C_COMPILER_LINKER_VERSION 2.46)
|
||||
set(CMAKE_C_COMPILER_LINKER_FRONTEND_VARIANT GNU)
|
||||
set(CMAKE_MT "")
|
||||
set(CMAKE_TAPI "CMAKE_TAPI-NOTFOUND")
|
||||
set(CMAKE_COMPILER_IS_GNUCC 1)
|
||||
set(CMAKE_C_COMPILER_LOADED 1)
|
||||
set(CMAKE_C_COMPILER_WORKS TRUE)
|
||||
set(CMAKE_C_ABI_COMPILED TRUE)
|
||||
|
||||
set(CMAKE_C_COMPILER_ENV_VAR "CC")
|
||||
|
||||
set(CMAKE_C_COMPILER_ID_RUN 1)
|
||||
set(CMAKE_C_SOURCE_FILE_EXTENSIONS c;m)
|
||||
set(CMAKE_C_IGNORE_EXTENSIONS h;H;o;O;obj;OBJ;def;DEF;rc;RC)
|
||||
set(CMAKE_C_LINKER_PREFERENCE 10)
|
||||
set(CMAKE_C_LINKER_DEPFILE_SUPPORTED TRUE)
|
||||
set(CMAKE_LINKER_PUSHPOP_STATE_SUPPORTED TRUE)
|
||||
set(CMAKE_C_LINKER_PUSHPOP_STATE_SUPPORTED TRUE)
|
||||
|
||||
# Save compiler ABI information.
|
||||
set(CMAKE_C_SIZEOF_DATA_PTR "8")
|
||||
set(CMAKE_C_COMPILER_ABI "ELF")
|
||||
set(CMAKE_C_BYTE_ORDER "LITTLE_ENDIAN")
|
||||
set(CMAKE_C_LIBRARY_ARCHITECTURE "")
|
||||
|
||||
if(CMAKE_C_SIZEOF_DATA_PTR)
|
||||
set(CMAKE_SIZEOF_VOID_P "${CMAKE_C_SIZEOF_DATA_PTR}")
|
||||
endif()
|
||||
|
||||
if(CMAKE_C_COMPILER_ABI)
|
||||
set(CMAKE_INTERNAL_PLATFORM_ABI "${CMAKE_C_COMPILER_ABI}")
|
||||
endif()
|
||||
|
||||
if(CMAKE_C_LIBRARY_ARCHITECTURE)
|
||||
set(CMAKE_LIBRARY_ARCHITECTURE "")
|
||||
endif()
|
||||
|
||||
set(CMAKE_C_CL_SHOWINCLUDES_PREFIX "")
|
||||
if(CMAKE_C_CL_SHOWINCLUDES_PREFIX)
|
||||
set(CMAKE_CL_SHOWINCLUDES_PREFIX "${CMAKE_C_CL_SHOWINCLUDES_PREFIX}")
|
||||
endif()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
set(CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES "/usr/lib/gcc/x86_64-pc-linux-gnu/15.2.1/include;/usr/local/include;/usr/lib/gcc/x86_64-pc-linux-gnu/15.2.1/include-fixed;/usr/include")
|
||||
set(CMAKE_C_IMPLICIT_LINK_LIBRARIES "gcc;gcc_s;c;gcc;gcc_s")
|
||||
set(CMAKE_C_IMPLICIT_LINK_DIRECTORIES "/usr/lib/gcc/x86_64-pc-linux-gnu/15.2.1;/usr/lib;/lib")
|
||||
set(CMAKE_C_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES "")
|
||||
Binary file not shown.
|
|
@ -0,0 +1,15 @@
|
|||
set(CMAKE_HOST_SYSTEM "Linux-6.19.10-arch1-1")
|
||||
set(CMAKE_HOST_SYSTEM_NAME "Linux")
|
||||
set(CMAKE_HOST_SYSTEM_VERSION "6.19.10-arch1-1")
|
||||
set(CMAKE_HOST_SYSTEM_PROCESSOR "x86_64")
|
||||
|
||||
|
||||
|
||||
set(CMAKE_SYSTEM "Linux-6.19.10-arch1-1")
|
||||
set(CMAKE_SYSTEM_NAME "Linux")
|
||||
set(CMAKE_SYSTEM_VERSION "6.19.10-arch1-1")
|
||||
set(CMAKE_SYSTEM_PROCESSOR "x86_64")
|
||||
|
||||
set(CMAKE_CROSSCOMPILING "FALSE")
|
||||
|
||||
set(CMAKE_SYSTEM_LOADED 1)
|
||||
|
|
@ -0,0 +1,934 @@
|
|||
#ifdef __cplusplus
|
||||
# error "A C++ compiler has been selected for C."
|
||||
#endif
|
||||
|
||||
#if defined(__18CXX)
|
||||
# define ID_VOID_MAIN
|
||||
#endif
|
||||
#if defined(__CLASSIC_C__)
|
||||
/* cv-qualifiers did not exist in K&R C */
|
||||
# define const
|
||||
# define volatile
|
||||
#endif
|
||||
|
||||
#if !defined(__has_include)
|
||||
/* If the compiler does not have __has_include, pretend the answer is
|
||||
always no. */
|
||||
# define __has_include(x) 0
|
||||
#endif
|
||||
|
||||
|
||||
/* Version number components: V=Version, R=Revision, P=Patch
|
||||
Version date components: YYYY=Year, MM=Month, DD=Day */
|
||||
|
||||
#if defined(__INTEL_COMPILER) || defined(__ICC)
|
||||
# define COMPILER_ID "Intel"
|
||||
# if defined(_MSC_VER)
|
||||
# define SIMULATE_ID "MSVC"
|
||||
# endif
|
||||
# if defined(__GNUC__)
|
||||
# define SIMULATE_ID "GNU"
|
||||
# endif
|
||||
/* __INTEL_COMPILER = VRP prior to 2021, and then VVVV for 2021 and later,
|
||||
except that a few beta releases use the old format with V=2021. */
|
||||
# if __INTEL_COMPILER < 2021 || __INTEL_COMPILER == 202110 || __INTEL_COMPILER == 202111
|
||||
# define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER/100)
|
||||
# define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER/10 % 10)
|
||||
# if defined(__INTEL_COMPILER_UPDATE)
|
||||
# define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER_UPDATE)
|
||||
# else
|
||||
# define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER % 10)
|
||||
# endif
|
||||
# else
|
||||
# define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER)
|
||||
# define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER_UPDATE)
|
||||
/* The third version component from --version is an update index,
|
||||
but no macro is provided for it. */
|
||||
# define COMPILER_VERSION_PATCH DEC(0)
|
||||
# endif
|
||||
# if defined(__INTEL_COMPILER_BUILD_DATE)
|
||||
/* __INTEL_COMPILER_BUILD_DATE = YYYYMMDD */
|
||||
# define COMPILER_VERSION_TWEAK DEC(__INTEL_COMPILER_BUILD_DATE)
|
||||
# endif
|
||||
# if defined(_MSC_VER)
|
||||
/* _MSC_VER = VVRR */
|
||||
# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
|
||||
# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
|
||||
# endif
|
||||
# if defined(__GNUC__)
|
||||
# define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
|
||||
# elif defined(__GNUG__)
|
||||
# define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
|
||||
# endif
|
||||
# if defined(__GNUC_MINOR__)
|
||||
# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
|
||||
# endif
|
||||
# if defined(__GNUC_PATCHLEVEL__)
|
||||
# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
|
||||
# endif
|
||||
|
||||
#elif (defined(__clang__) && defined(__INTEL_CLANG_COMPILER)) || defined(__INTEL_LLVM_COMPILER)
|
||||
# define COMPILER_ID "IntelLLVM"
|
||||
#if defined(_MSC_VER)
|
||||
# define SIMULATE_ID "MSVC"
|
||||
#endif
|
||||
#if defined(__GNUC__)
|
||||
# define SIMULATE_ID "GNU"
|
||||
#endif
|
||||
/* __INTEL_LLVM_COMPILER = VVVVRP prior to 2021.2.0, VVVVRRPP for 2021.2.0 and
|
||||
* later. Look for 6 digit vs. 8 digit version number to decide encoding.
|
||||
* VVVV is no smaller than the current year when a version is released.
|
||||
*/
|
||||
#if __INTEL_LLVM_COMPILER < 1000000L
|
||||
# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/100)
|
||||
# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/10 % 10)
|
||||
# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER % 10)
|
||||
#else
|
||||
# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/10000)
|
||||
# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/100 % 100)
|
||||
# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER % 100)
|
||||
#endif
|
||||
#if defined(_MSC_VER)
|
||||
/* _MSC_VER = VVRR */
|
||||
# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
|
||||
# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
|
||||
#endif
|
||||
#if defined(__GNUC__)
|
||||
# define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
|
||||
#elif defined(__GNUG__)
|
||||
# define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
|
||||
#endif
|
||||
#if defined(__GNUC_MINOR__)
|
||||
# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
|
||||
#endif
|
||||
#if defined(__GNUC_PATCHLEVEL__)
|
||||
# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
|
||||
#endif
|
||||
|
||||
#elif defined(__PATHCC__)
|
||||
# define COMPILER_ID "PathScale"
|
||||
# define COMPILER_VERSION_MAJOR DEC(__PATHCC__)
|
||||
# define COMPILER_VERSION_MINOR DEC(__PATHCC_MINOR__)
|
||||
# if defined(__PATHCC_PATCHLEVEL__)
|
||||
# define COMPILER_VERSION_PATCH DEC(__PATHCC_PATCHLEVEL__)
|
||||
# endif
|
||||
|
||||
#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__)
|
||||
# define COMPILER_ID "Embarcadero"
|
||||
# define COMPILER_VERSION_MAJOR HEX(__CODEGEARC_VERSION__>>24 & 0x00FF)
|
||||
# define COMPILER_VERSION_MINOR HEX(__CODEGEARC_VERSION__>>16 & 0x00FF)
|
||||
# define COMPILER_VERSION_PATCH DEC(__CODEGEARC_VERSION__ & 0xFFFF)
|
||||
|
||||
#elif defined(__BORLANDC__)
|
||||
# define COMPILER_ID "Borland"
|
||||
/* __BORLANDC__ = 0xVRR */
|
||||
# define COMPILER_VERSION_MAJOR HEX(__BORLANDC__>>8)
|
||||
# define COMPILER_VERSION_MINOR HEX(__BORLANDC__ & 0xFF)
|
||||
|
||||
#elif defined(__WATCOMC__) && __WATCOMC__ < 1200
|
||||
# define COMPILER_ID "Watcom"
|
||||
/* __WATCOMC__ = VVRR */
|
||||
# define COMPILER_VERSION_MAJOR DEC(__WATCOMC__ / 100)
|
||||
# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
|
||||
# if (__WATCOMC__ % 10) > 0
|
||||
# define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
|
||||
# endif
|
||||
|
||||
#elif defined(__WATCOMC__)
|
||||
# define COMPILER_ID "OpenWatcom"
|
||||
/* __WATCOMC__ = VVRP + 1100 */
|
||||
# define COMPILER_VERSION_MAJOR DEC((__WATCOMC__ - 1100) / 100)
|
||||
# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
|
||||
# if (__WATCOMC__ % 10) > 0
|
||||
# define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
|
||||
# endif
|
||||
|
||||
#elif defined(__SUNPRO_C)
|
||||
# define COMPILER_ID "SunPro"
|
||||
# if __SUNPRO_C >= 0x5100
|
||||
/* __SUNPRO_C = 0xVRRP */
|
||||
# define COMPILER_VERSION_MAJOR HEX(__SUNPRO_C>>12)
|
||||
# define COMPILER_VERSION_MINOR HEX(__SUNPRO_C>>4 & 0xFF)
|
||||
# define COMPILER_VERSION_PATCH HEX(__SUNPRO_C & 0xF)
|
||||
# else
|
||||
/* __SUNPRO_CC = 0xVRP */
|
||||
# define COMPILER_VERSION_MAJOR HEX(__SUNPRO_C>>8)
|
||||
# define COMPILER_VERSION_MINOR HEX(__SUNPRO_C>>4 & 0xF)
|
||||
# define COMPILER_VERSION_PATCH HEX(__SUNPRO_C & 0xF)
|
||||
# endif
|
||||
|
||||
#elif defined(__HP_cc)
|
||||
# define COMPILER_ID "HP"
|
||||
/* __HP_cc = VVRRPP */
|
||||
# define COMPILER_VERSION_MAJOR DEC(__HP_cc/10000)
|
||||
# define COMPILER_VERSION_MINOR DEC(__HP_cc/100 % 100)
|
||||
# define COMPILER_VERSION_PATCH DEC(__HP_cc % 100)
|
||||
|
||||
#elif defined(__DECC)
|
||||
# define COMPILER_ID "Compaq"
|
||||
/* __DECC_VER = VVRRTPPPP */
|
||||
# define COMPILER_VERSION_MAJOR DEC(__DECC_VER/10000000)
|
||||
# define COMPILER_VERSION_MINOR DEC(__DECC_VER/100000 % 100)
|
||||
# define COMPILER_VERSION_PATCH DEC(__DECC_VER % 10000)
|
||||
|
||||
#elif defined(__IBMC__) && defined(__COMPILER_VER__)
|
||||
# define COMPILER_ID "zOS"
|
||||
/* __IBMC__ = VRP */
|
||||
# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100)
|
||||
# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10)
|
||||
# define COMPILER_VERSION_PATCH DEC(__IBMC__ % 10)
|
||||
|
||||
#elif defined(__open_xl__) && defined(__clang__)
|
||||
# define COMPILER_ID "IBMClang"
|
||||
# define COMPILER_VERSION_MAJOR DEC(__open_xl_version__)
|
||||
# define COMPILER_VERSION_MINOR DEC(__open_xl_release__)
|
||||
# define COMPILER_VERSION_PATCH DEC(__open_xl_modification__)
|
||||
# define COMPILER_VERSION_TWEAK DEC(__open_xl_ptf_fix_level__)
|
||||
# define COMPILER_VERSION_INTERNAL_STR __clang_version__
|
||||
|
||||
|
||||
#elif defined(__ibmxl__) && defined(__clang__)
|
||||
# define COMPILER_ID "XLClang"
|
||||
# define COMPILER_VERSION_MAJOR DEC(__ibmxl_version__)
|
||||
# define COMPILER_VERSION_MINOR DEC(__ibmxl_release__)
|
||||
# define COMPILER_VERSION_PATCH DEC(__ibmxl_modification__)
|
||||
# define COMPILER_VERSION_TWEAK DEC(__ibmxl_ptf_fix_level__)
|
||||
|
||||
|
||||
#elif defined(__IBMC__) && !defined(__COMPILER_VER__) && __IBMC__ >= 800
|
||||
# define COMPILER_ID "XL"
|
||||
/* __IBMC__ = VRP */
|
||||
# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100)
|
||||
# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10)
|
||||
# define COMPILER_VERSION_PATCH DEC(__IBMC__ % 10)
|
||||
|
||||
#elif defined(__IBMC__) && !defined(__COMPILER_VER__) && __IBMC__ < 800
|
||||
# define COMPILER_ID "VisualAge"
|
||||
/* __IBMC__ = VRP */
|
||||
# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100)
|
||||
# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10)
|
||||
# define COMPILER_VERSION_PATCH DEC(__IBMC__ % 10)
|
||||
|
||||
#elif defined(__NVCOMPILER)
|
||||
# define COMPILER_ID "NVHPC"
|
||||
# define COMPILER_VERSION_MAJOR DEC(__NVCOMPILER_MAJOR__)
|
||||
# define COMPILER_VERSION_MINOR DEC(__NVCOMPILER_MINOR__)
|
||||
# if defined(__NVCOMPILER_PATCHLEVEL__)
|
||||
# define COMPILER_VERSION_PATCH DEC(__NVCOMPILER_PATCHLEVEL__)
|
||||
# endif
|
||||
|
||||
#elif defined(__PGI)
|
||||
# define COMPILER_ID "PGI"
|
||||
# define COMPILER_VERSION_MAJOR DEC(__PGIC__)
|
||||
# define COMPILER_VERSION_MINOR DEC(__PGIC_MINOR__)
|
||||
# if defined(__PGIC_PATCHLEVEL__)
|
||||
# define COMPILER_VERSION_PATCH DEC(__PGIC_PATCHLEVEL__)
|
||||
# endif
|
||||
|
||||
#elif defined(__clang__) && defined(__cray__)
|
||||
# define COMPILER_ID "CrayClang"
|
||||
# define COMPILER_VERSION_MAJOR DEC(__cray_major__)
|
||||
# define COMPILER_VERSION_MINOR DEC(__cray_minor__)
|
||||
# define COMPILER_VERSION_PATCH DEC(__cray_patchlevel__)
|
||||
# define COMPILER_VERSION_INTERNAL_STR __clang_version__
|
||||
|
||||
|
||||
#elif defined(_CRAYC)
|
||||
# define COMPILER_ID "Cray"
|
||||
# define COMPILER_VERSION_MAJOR DEC(_RELEASE_MAJOR)
|
||||
# define COMPILER_VERSION_MINOR DEC(_RELEASE_MINOR)
|
||||
|
||||
#elif defined(__TI_COMPILER_VERSION__)
|
||||
# define COMPILER_ID "TI"
|
||||
/* __TI_COMPILER_VERSION__ = VVVRRRPPP */
|
||||
# define COMPILER_VERSION_MAJOR DEC(__TI_COMPILER_VERSION__/1000000)
|
||||
# define COMPILER_VERSION_MINOR DEC(__TI_COMPILER_VERSION__/1000 % 1000)
|
||||
# define COMPILER_VERSION_PATCH DEC(__TI_COMPILER_VERSION__ % 1000)
|
||||
|
||||
#elif defined(__CLANG_FUJITSU)
|
||||
# define COMPILER_ID "FujitsuClang"
|
||||
# define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
|
||||
# define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
|
||||
# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
|
||||
# define COMPILER_VERSION_INTERNAL_STR __clang_version__
|
||||
|
||||
|
||||
#elif defined(__FUJITSU)
|
||||
# define COMPILER_ID "Fujitsu"
|
||||
# if defined(__FCC_version__)
|
||||
# define COMPILER_VERSION __FCC_version__
|
||||
# elif defined(__FCC_major__)
|
||||
# define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
|
||||
# define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
|
||||
# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
|
||||
# endif
|
||||
# if defined(__fcc_version)
|
||||
# define COMPILER_VERSION_INTERNAL DEC(__fcc_version)
|
||||
# elif defined(__FCC_VERSION)
|
||||
# define COMPILER_VERSION_INTERNAL DEC(__FCC_VERSION)
|
||||
# endif
|
||||
|
||||
|
||||
#elif defined(__ghs__)
|
||||
# define COMPILER_ID "GHS"
|
||||
/* __GHS_VERSION_NUMBER = VVVVRP */
|
||||
# ifdef __GHS_VERSION_NUMBER
|
||||
# define COMPILER_VERSION_MAJOR DEC(__GHS_VERSION_NUMBER / 100)
|
||||
# define COMPILER_VERSION_MINOR DEC(__GHS_VERSION_NUMBER / 10 % 10)
|
||||
# define COMPILER_VERSION_PATCH DEC(__GHS_VERSION_NUMBER % 10)
|
||||
# endif
|
||||
|
||||
#elif defined(__TASKING__)
|
||||
# define COMPILER_ID "Tasking"
|
||||
# define COMPILER_VERSION_MAJOR DEC(__VERSION__/1000)
|
||||
# define COMPILER_VERSION_MINOR DEC(__VERSION__ % 100)
|
||||
# define COMPILER_VERSION_INTERNAL DEC(__VERSION__)
|
||||
|
||||
#elif defined(__ORANGEC__)
|
||||
# define COMPILER_ID "OrangeC"
|
||||
# define COMPILER_VERSION_MAJOR DEC(__ORANGEC_MAJOR__)
|
||||
# define COMPILER_VERSION_MINOR DEC(__ORANGEC_MINOR__)
|
||||
# define COMPILER_VERSION_PATCH DEC(__ORANGEC_PATCHLEVEL__)
|
||||
|
||||
#elif defined(__RENESAS__)
|
||||
# define COMPILER_ID "Renesas"
|
||||
/* __RENESAS_VERSION__ = 0xVVRRPP00 */
|
||||
# define COMPILER_VERSION_MAJOR HEX(__RENESAS_VERSION__ >> 24 & 0xFF)
|
||||
# define COMPILER_VERSION_MINOR HEX(__RENESAS_VERSION__ >> 16 & 0xFF)
|
||||
# define COMPILER_VERSION_PATCH HEX(__RENESAS_VERSION__ >> 8 & 0xFF)
|
||||
|
||||
#elif defined(__TINYC__)
|
||||
# define COMPILER_ID "TinyCC"
|
||||
|
||||
#elif defined(__BCC__)
|
||||
# define COMPILER_ID "Bruce"
|
||||
|
||||
#elif defined(__SCO_VERSION__)
|
||||
# define COMPILER_ID "SCO"
|
||||
|
||||
#elif defined(__ARMCC_VERSION) && !defined(__clang__)
|
||||
# define COMPILER_ID "ARMCC"
|
||||
#if __ARMCC_VERSION >= 1000000
|
||||
/* __ARMCC_VERSION = VRRPPPP */
|
||||
# define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/1000000)
|
||||
# define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 100)
|
||||
# define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION % 10000)
|
||||
#else
|
||||
/* __ARMCC_VERSION = VRPPPP */
|
||||
# define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/100000)
|
||||
# define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 10)
|
||||
# define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION % 10000)
|
||||
#endif
|
||||
|
||||
|
||||
#elif defined(__clang__) && defined(__apple_build_version__)
|
||||
# define COMPILER_ID "AppleClang"
|
||||
# if defined(_MSC_VER)
|
||||
# define SIMULATE_ID "MSVC"
|
||||
# endif
|
||||
# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
|
||||
# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
|
||||
# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
|
||||
# if defined(_MSC_VER)
|
||||
/* _MSC_VER = VVRR */
|
||||
# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
|
||||
# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
|
||||
# endif
|
||||
# define COMPILER_VERSION_TWEAK DEC(__apple_build_version__)
|
||||
|
||||
#elif defined(__clang__) && defined(__ARMCOMPILER_VERSION)
|
||||
# define COMPILER_ID "ARMClang"
|
||||
# define COMPILER_VERSION_MAJOR DEC(__ARMCOMPILER_VERSION/1000000)
|
||||
# define COMPILER_VERSION_MINOR DEC(__ARMCOMPILER_VERSION/10000 % 100)
|
||||
# define COMPILER_VERSION_PATCH DEC(__ARMCOMPILER_VERSION/100 % 100)
|
||||
# define COMPILER_VERSION_INTERNAL DEC(__ARMCOMPILER_VERSION)
|
||||
|
||||
#elif defined(__clang__) && defined(__ti__)
|
||||
# define COMPILER_ID "TIClang"
|
||||
# define COMPILER_VERSION_MAJOR DEC(__ti_major__)
|
||||
# define COMPILER_VERSION_MINOR DEC(__ti_minor__)
|
||||
# define COMPILER_VERSION_PATCH DEC(__ti_patchlevel__)
|
||||
# define COMPILER_VERSION_INTERNAL DEC(__ti_version__)
|
||||
|
||||
#elif defined(__clang__)
|
||||
# define COMPILER_ID "Clang"
|
||||
# if defined(_MSC_VER)
|
||||
# define SIMULATE_ID "MSVC"
|
||||
# endif
|
||||
# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
|
||||
# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
|
||||
# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
|
||||
# if defined(_MSC_VER)
|
||||
/* _MSC_VER = VVRR */
|
||||
# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
|
||||
# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
|
||||
# endif
|
||||
|
||||
#elif defined(__LCC__) && (defined(__GNUC__) || defined(__GNUG__) || defined(__MCST__))
|
||||
# define COMPILER_ID "LCC"
|
||||
# define COMPILER_VERSION_MAJOR DEC(__LCC__ / 100)
|
||||
# define COMPILER_VERSION_MINOR DEC(__LCC__ % 100)
|
||||
# if defined(__LCC_MINOR__)
|
||||
# define COMPILER_VERSION_PATCH DEC(__LCC_MINOR__)
|
||||
# endif
|
||||
# if defined(__GNUC__) && defined(__GNUC_MINOR__)
|
||||
# define SIMULATE_ID "GNU"
|
||||
# define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
|
||||
# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
|
||||
# if defined(__GNUC_PATCHLEVEL__)
|
||||
# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
|
||||
# endif
|
||||
# endif
|
||||
|
||||
#elif defined(__GNUC__)
|
||||
# define COMPILER_ID "GNU"
|
||||
# define COMPILER_VERSION_MAJOR DEC(__GNUC__)
|
||||
# if defined(__GNUC_MINOR__)
|
||||
# define COMPILER_VERSION_MINOR DEC(__GNUC_MINOR__)
|
||||
# endif
|
||||
# if defined(__GNUC_PATCHLEVEL__)
|
||||
# define COMPILER_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
|
||||
# endif
|
||||
|
||||
#elif defined(_MSC_VER)
|
||||
# define COMPILER_ID "MSVC"
|
||||
/* _MSC_VER = VVRR */
|
||||
# define COMPILER_VERSION_MAJOR DEC(_MSC_VER / 100)
|
||||
# define COMPILER_VERSION_MINOR DEC(_MSC_VER % 100)
|
||||
# if defined(_MSC_FULL_VER)
|
||||
# if _MSC_VER >= 1400
|
||||
/* _MSC_FULL_VER = VVRRPPPPP */
|
||||
# define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 100000)
|
||||
# else
|
||||
/* _MSC_FULL_VER = VVRRPPPP */
|
||||
# define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 10000)
|
||||
# endif
|
||||
# endif
|
||||
# if defined(_MSC_BUILD)
|
||||
# define COMPILER_VERSION_TWEAK DEC(_MSC_BUILD)
|
||||
# endif
|
||||
|
||||
#elif defined(_ADI_COMPILER)
|
||||
# define COMPILER_ID "ADSP"
|
||||
#if defined(__VERSIONNUM__)
|
||||
/* __VERSIONNUM__ = 0xVVRRPPTT */
|
||||
# define COMPILER_VERSION_MAJOR DEC(__VERSIONNUM__ >> 24 & 0xFF)
|
||||
# define COMPILER_VERSION_MINOR DEC(__VERSIONNUM__ >> 16 & 0xFF)
|
||||
# define COMPILER_VERSION_PATCH DEC(__VERSIONNUM__ >> 8 & 0xFF)
|
||||
# define COMPILER_VERSION_TWEAK DEC(__VERSIONNUM__ & 0xFF)
|
||||
#endif
|
||||
|
||||
#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
|
||||
# define COMPILER_ID "IAR"
|
||||
# if defined(__VER__) && defined(__ICCARM__)
|
||||
# define COMPILER_VERSION_MAJOR DEC((__VER__) / 1000000)
|
||||
# define COMPILER_VERSION_MINOR DEC(((__VER__) / 1000) % 1000)
|
||||
# define COMPILER_VERSION_PATCH DEC((__VER__) % 1000)
|
||||
# define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
|
||||
# elif defined(__VER__) && (defined(__ICCAVR__) || defined(__ICCRX__) || defined(__ICCRH850__) || defined(__ICCRL78__) || defined(__ICC430__) || defined(__ICCRISCV__) || defined(__ICCV850__) || defined(__ICC8051__) || defined(__ICCSTM8__))
|
||||
# define COMPILER_VERSION_MAJOR DEC((__VER__) / 100)
|
||||
# define COMPILER_VERSION_MINOR DEC((__VER__) - (((__VER__) / 100)*100))
|
||||
# define COMPILER_VERSION_PATCH DEC(__SUBVERSION__)
|
||||
# define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
|
||||
# endif
|
||||
|
||||
#elif defined(__DCC__) && defined(_DIAB_TOOL)
|
||||
# define COMPILER_ID "Diab"
|
||||
# define COMPILER_VERSION_MAJOR DEC(__VERSION_MAJOR_NUMBER__)
|
||||
# define COMPILER_VERSION_MINOR DEC(__VERSION_MINOR_NUMBER__)
|
||||
# define COMPILER_VERSION_PATCH DEC(__VERSION_ARCH_FEATURE_NUMBER__)
|
||||
# define COMPILER_VERSION_TWEAK DEC(__VERSION_BUG_FIX_NUMBER__)
|
||||
|
||||
|
||||
#elif defined(__SDCC_VERSION_MAJOR) || defined(SDCC)
|
||||
# define COMPILER_ID "SDCC"
|
||||
# if defined(__SDCC_VERSION_MAJOR)
|
||||
# define COMPILER_VERSION_MAJOR DEC(__SDCC_VERSION_MAJOR)
|
||||
# define COMPILER_VERSION_MINOR DEC(__SDCC_VERSION_MINOR)
|
||||
# define COMPILER_VERSION_PATCH DEC(__SDCC_VERSION_PATCH)
|
||||
# else
|
||||
/* SDCC = VRP */
|
||||
# define COMPILER_VERSION_MAJOR DEC(SDCC/100)
|
||||
# define COMPILER_VERSION_MINOR DEC(SDCC/10 % 10)
|
||||
# define COMPILER_VERSION_PATCH DEC(SDCC % 10)
|
||||
# endif
|
||||
|
||||
|
||||
/* These compilers are either not known or too old to define an
|
||||
identification macro. Try to identify the platform and guess that
|
||||
it is the native compiler. */
|
||||
#elif defined(__hpux) || defined(__hpua)
|
||||
# define COMPILER_ID "HP"
|
||||
|
||||
#else /* unknown compiler */
|
||||
# define COMPILER_ID ""
|
||||
#endif
|
||||
|
||||
/* Construct the string literal in pieces to prevent the source from
|
||||
getting matched. Store it in a pointer rather than an array
|
||||
because some compilers will just produce instructions to fill the
|
||||
array rather than assigning a pointer to a static array. */
|
||||
char const* info_compiler = "INFO" ":" "compiler[" COMPILER_ID "]";
|
||||
#ifdef SIMULATE_ID
|
||||
char const* info_simulate = "INFO" ":" "simulate[" SIMULATE_ID "]";
|
||||
#endif
|
||||
|
||||
#ifdef __QNXNTO__
|
||||
char const* qnxnto = "INFO" ":" "qnxnto[]";
|
||||
#endif
|
||||
|
||||
#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
|
||||
char const *info_cray = "INFO" ":" "compiler_wrapper[CrayPrgEnv]";
|
||||
#endif
|
||||
|
||||
#define STRINGIFY_HELPER(X) #X
|
||||
#define STRINGIFY(X) STRINGIFY_HELPER(X)
|
||||
|
||||
/* Identify known platforms by name. */
|
||||
#if defined(__linux) || defined(__linux__) || defined(linux)
|
||||
# define PLATFORM_ID "Linux"
|
||||
|
||||
#elif defined(__MSYS__)
|
||||
# define PLATFORM_ID "MSYS"
|
||||
|
||||
#elif defined(__CYGWIN__)
|
||||
# define PLATFORM_ID "Cygwin"
|
||||
|
||||
#elif defined(__MINGW32__)
|
||||
# define PLATFORM_ID "MinGW"
|
||||
|
||||
#elif defined(__APPLE__)
|
||||
# define PLATFORM_ID "Darwin"
|
||||
|
||||
#elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
|
||||
# define PLATFORM_ID "Windows"
|
||||
|
||||
#elif defined(__FreeBSD__) || defined(__FreeBSD)
|
||||
# define PLATFORM_ID "FreeBSD"
|
||||
|
||||
#elif defined(__NetBSD__) || defined(__NetBSD)
|
||||
# define PLATFORM_ID "NetBSD"
|
||||
|
||||
#elif defined(__OpenBSD__) || defined(__OPENBSD)
|
||||
# define PLATFORM_ID "OpenBSD"
|
||||
|
||||
#elif defined(__sun) || defined(sun)
|
||||
# define PLATFORM_ID "SunOS"
|
||||
|
||||
#elif defined(_AIX) || defined(__AIX) || defined(__AIX__) || defined(__aix) || defined(__aix__)
|
||||
# define PLATFORM_ID "AIX"
|
||||
|
||||
#elif defined(__hpux) || defined(__hpux__)
|
||||
# define PLATFORM_ID "HP-UX"
|
||||
|
||||
#elif defined(__HAIKU__)
|
||||
# define PLATFORM_ID "Haiku"
|
||||
|
||||
#elif defined(__BeOS) || defined(__BEOS__) || defined(_BEOS)
|
||||
# define PLATFORM_ID "BeOS"
|
||||
|
||||
#elif defined(__QNX__) || defined(__QNXNTO__)
|
||||
# define PLATFORM_ID "QNX"
|
||||
|
||||
#elif defined(__tru64) || defined(_tru64) || defined(__TRU64__)
|
||||
# define PLATFORM_ID "Tru64"
|
||||
|
||||
#elif defined(__riscos) || defined(__riscos__)
|
||||
# define PLATFORM_ID "RISCos"
|
||||
|
||||
#elif defined(__sinix) || defined(__sinix__) || defined(__SINIX__)
|
||||
# define PLATFORM_ID "SINIX"
|
||||
|
||||
#elif defined(__UNIX_SV__)
|
||||
# define PLATFORM_ID "UNIX_SV"
|
||||
|
||||
#elif defined(__bsdos__)
|
||||
# define PLATFORM_ID "BSDOS"
|
||||
|
||||
#elif defined(_MPRAS) || defined(MPRAS)
|
||||
# define PLATFORM_ID "MP-RAS"
|
||||
|
||||
#elif defined(__osf) || defined(__osf__)
|
||||
# define PLATFORM_ID "OSF1"
|
||||
|
||||
#elif defined(_SCO_SV) || defined(SCO_SV) || defined(sco_sv)
|
||||
# define PLATFORM_ID "SCO_SV"
|
||||
|
||||
#elif defined(__ultrix) || defined(__ultrix__) || defined(_ULTRIX)
|
||||
# define PLATFORM_ID "ULTRIX"
|
||||
|
||||
#elif defined(__XENIX__) || defined(_XENIX) || defined(XENIX)
|
||||
# define PLATFORM_ID "Xenix"
|
||||
|
||||
#elif defined(__WATCOMC__)
|
||||
# if defined(__LINUX__)
|
||||
# define PLATFORM_ID "Linux"
|
||||
|
||||
# elif defined(__DOS__)
|
||||
# define PLATFORM_ID "DOS"
|
||||
|
||||
# elif defined(__OS2__)
|
||||
# define PLATFORM_ID "OS2"
|
||||
|
||||
# elif defined(__WINDOWS__)
|
||||
# define PLATFORM_ID "Windows3x"
|
||||
|
||||
# elif defined(__VXWORKS__)
|
||||
# define PLATFORM_ID "VxWorks"
|
||||
|
||||
# else /* unknown platform */
|
||||
# define PLATFORM_ID
|
||||
# endif
|
||||
|
||||
#elif defined(__INTEGRITY)
|
||||
# if defined(INT_178B)
|
||||
# define PLATFORM_ID "Integrity178"
|
||||
|
||||
# else /* regular Integrity */
|
||||
# define PLATFORM_ID "Integrity"
|
||||
# endif
|
||||
|
||||
# elif defined(_ADI_COMPILER)
|
||||
# define PLATFORM_ID "ADSP"
|
||||
|
||||
#else /* unknown platform */
|
||||
# define PLATFORM_ID
|
||||
|
||||
#endif
|
||||
|
||||
/* For windows compilers MSVC and Intel we can determine
|
||||
the architecture of the compiler being used. This is because
|
||||
the compilers do not have flags that can change the architecture,
|
||||
but rather depend on which compiler is being used
|
||||
*/
|
||||
#if defined(_WIN32) && defined(_MSC_VER)
|
||||
# if defined(_M_IA64)
|
||||
# define ARCHITECTURE_ID "IA64"
|
||||
|
||||
# elif defined(_M_ARM64EC)
|
||||
# define ARCHITECTURE_ID "ARM64EC"
|
||||
|
||||
# elif defined(_M_X64) || defined(_M_AMD64)
|
||||
# define ARCHITECTURE_ID "x64"
|
||||
|
||||
# elif defined(_M_IX86)
|
||||
# define ARCHITECTURE_ID "X86"
|
||||
|
||||
# elif defined(_M_ARM64)
|
||||
# define ARCHITECTURE_ID "ARM64"
|
||||
|
||||
# elif defined(_M_ARM)
|
||||
# if _M_ARM == 4
|
||||
# define ARCHITECTURE_ID "ARMV4I"
|
||||
# elif _M_ARM == 5
|
||||
# define ARCHITECTURE_ID "ARMV5I"
|
||||
# else
|
||||
# define ARCHITECTURE_ID "ARMV" STRINGIFY(_M_ARM)
|
||||
# endif
|
||||
|
||||
# elif defined(_M_MIPS)
|
||||
# define ARCHITECTURE_ID "MIPS"
|
||||
|
||||
# elif defined(_M_SH)
|
||||
# define ARCHITECTURE_ID "SHx"
|
||||
|
||||
# else /* unknown architecture */
|
||||
# define ARCHITECTURE_ID ""
|
||||
# endif
|
||||
|
||||
#elif defined(__WATCOMC__)
|
||||
# if defined(_M_I86)
|
||||
# define ARCHITECTURE_ID "I86"
|
||||
|
||||
# elif defined(_M_IX86)
|
||||
# define ARCHITECTURE_ID "X86"
|
||||
|
||||
# else /* unknown architecture */
|
||||
# define ARCHITECTURE_ID ""
|
||||
# endif
|
||||
|
||||
#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
|
||||
# if defined(__ICCARM__)
|
||||
# define ARCHITECTURE_ID "ARM"
|
||||
|
||||
# elif defined(__ICCRX__)
|
||||
# define ARCHITECTURE_ID "RX"
|
||||
|
||||
# elif defined(__ICCRH850__)
|
||||
# define ARCHITECTURE_ID "RH850"
|
||||
|
||||
# elif defined(__ICCRL78__)
|
||||
# define ARCHITECTURE_ID "RL78"
|
||||
|
||||
# elif defined(__ICCRISCV__)
|
||||
# define ARCHITECTURE_ID "RISCV"
|
||||
|
||||
# elif defined(__ICCAVR__)
|
||||
# define ARCHITECTURE_ID "AVR"
|
||||
|
||||
# elif defined(__ICC430__)
|
||||
# define ARCHITECTURE_ID "MSP430"
|
||||
|
||||
# elif defined(__ICCV850__)
|
||||
# define ARCHITECTURE_ID "V850"
|
||||
|
||||
# elif defined(__ICC8051__)
|
||||
# define ARCHITECTURE_ID "8051"
|
||||
|
||||
# elif defined(__ICCSTM8__)
|
||||
# define ARCHITECTURE_ID "STM8"
|
||||
|
||||
# else /* unknown architecture */
|
||||
# define ARCHITECTURE_ID ""
|
||||
# endif
|
||||
|
||||
#elif defined(__ghs__)
|
||||
# if defined(__PPC64__)
|
||||
# define ARCHITECTURE_ID "PPC64"
|
||||
|
||||
# elif defined(__ppc__)
|
||||
# define ARCHITECTURE_ID "PPC"
|
||||
|
||||
# elif defined(__ARM__)
|
||||
# define ARCHITECTURE_ID "ARM"
|
||||
|
||||
# elif defined(__x86_64__)
|
||||
# define ARCHITECTURE_ID "x64"
|
||||
|
||||
# elif defined(__i386__)
|
||||
# define ARCHITECTURE_ID "X86"
|
||||
|
||||
# else /* unknown architecture */
|
||||
# define ARCHITECTURE_ID ""
|
||||
# endif
|
||||
|
||||
#elif defined(__clang__) && defined(__ti__)
|
||||
# if defined(__ARM_ARCH)
|
||||
# define ARCHITECTURE_ID "ARM"
|
||||
|
||||
# else /* unknown architecture */
|
||||
# define ARCHITECTURE_ID ""
|
||||
# endif
|
||||
|
||||
#elif defined(__TI_COMPILER_VERSION__)
|
||||
# if defined(__TI_ARM__)
|
||||
# define ARCHITECTURE_ID "ARM"
|
||||
|
||||
# elif defined(__MSP430__)
|
||||
# define ARCHITECTURE_ID "MSP430"
|
||||
|
||||
# elif defined(__TMS320C28XX__)
|
||||
# define ARCHITECTURE_ID "TMS320C28x"
|
||||
|
||||
# elif defined(__TMS320C6X__) || defined(_TMS320C6X)
|
||||
# define ARCHITECTURE_ID "TMS320C6x"
|
||||
|
||||
# else /* unknown architecture */
|
||||
# define ARCHITECTURE_ID ""
|
||||
# endif
|
||||
|
||||
# elif defined(__ADSPSHARC__)
|
||||
# define ARCHITECTURE_ID "SHARC"
|
||||
|
||||
# elif defined(__ADSPBLACKFIN__)
|
||||
# define ARCHITECTURE_ID "Blackfin"
|
||||
|
||||
#elif defined(__TASKING__)
|
||||
|
||||
# if defined(__CTC__) || defined(__CPTC__)
|
||||
# define ARCHITECTURE_ID "TriCore"
|
||||
|
||||
# elif defined(__CMCS__)
|
||||
# define ARCHITECTURE_ID "MCS"
|
||||
|
||||
# elif defined(__CARM__) || defined(__CPARM__)
|
||||
# define ARCHITECTURE_ID "ARM"
|
||||
|
||||
# elif defined(__CARC__)
|
||||
# define ARCHITECTURE_ID "ARC"
|
||||
|
||||
# elif defined(__C51__)
|
||||
# define ARCHITECTURE_ID "8051"
|
||||
|
||||
# elif defined(__CPCP__)
|
||||
# define ARCHITECTURE_ID "PCP"
|
||||
|
||||
# else
|
||||
# define ARCHITECTURE_ID ""
|
||||
# endif
|
||||
|
||||
#elif defined(__RENESAS__)
|
||||
# if defined(__CCRX__)
|
||||
# define ARCHITECTURE_ID "RX"
|
||||
|
||||
# elif defined(__CCRL__)
|
||||
# define ARCHITECTURE_ID "RL78"
|
||||
|
||||
# elif defined(__CCRH__)
|
||||
# define ARCHITECTURE_ID "RH850"
|
||||
|
||||
# else
|
||||
# define ARCHITECTURE_ID ""
|
||||
# endif
|
||||
|
||||
#else
|
||||
# define ARCHITECTURE_ID
|
||||
#endif
|
||||
|
||||
/* Convert integer to decimal digit literals. */
|
||||
#define DEC(n) \
|
||||
('0' + (((n) / 10000000)%10)), \
|
||||
('0' + (((n) / 1000000)%10)), \
|
||||
('0' + (((n) / 100000)%10)), \
|
||||
('0' + (((n) / 10000)%10)), \
|
||||
('0' + (((n) / 1000)%10)), \
|
||||
('0' + (((n) / 100)%10)), \
|
||||
('0' + (((n) / 10)%10)), \
|
||||
('0' + ((n) % 10))
|
||||
|
||||
/* Convert integer to hex digit literals. */
|
||||
#define HEX(n) \
|
||||
('0' + ((n)>>28 & 0xF)), \
|
||||
('0' + ((n)>>24 & 0xF)), \
|
||||
('0' + ((n)>>20 & 0xF)), \
|
||||
('0' + ((n)>>16 & 0xF)), \
|
||||
('0' + ((n)>>12 & 0xF)), \
|
||||
('0' + ((n)>>8 & 0xF)), \
|
||||
('0' + ((n)>>4 & 0xF)), \
|
||||
('0' + ((n) & 0xF))
|
||||
|
||||
/* Construct a string literal encoding the version number. */
|
||||
#ifdef COMPILER_VERSION
|
||||
char const* info_version = "INFO" ":" "compiler_version[" COMPILER_VERSION "]";
|
||||
|
||||
/* Construct a string literal encoding the version number components. */
|
||||
#elif defined(COMPILER_VERSION_MAJOR)
|
||||
char const info_version[] = {
|
||||
'I', 'N', 'F', 'O', ':',
|
||||
'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','[',
|
||||
COMPILER_VERSION_MAJOR,
|
||||
# ifdef COMPILER_VERSION_MINOR
|
||||
'.', COMPILER_VERSION_MINOR,
|
||||
# ifdef COMPILER_VERSION_PATCH
|
||||
'.', COMPILER_VERSION_PATCH,
|
||||
# ifdef COMPILER_VERSION_TWEAK
|
||||
'.', COMPILER_VERSION_TWEAK,
|
||||
# endif
|
||||
# endif
|
||||
# endif
|
||||
']','\0'};
|
||||
#endif
|
||||
|
||||
/* Construct a string literal encoding the internal version number. */
|
||||
#ifdef COMPILER_VERSION_INTERNAL
|
||||
char const info_version_internal[] = {
|
||||
'I', 'N', 'F', 'O', ':',
|
||||
'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','_',
|
||||
'i','n','t','e','r','n','a','l','[',
|
||||
COMPILER_VERSION_INTERNAL,']','\0'};
|
||||
#elif defined(COMPILER_VERSION_INTERNAL_STR)
|
||||
char const* info_version_internal = "INFO" ":" "compiler_version_internal[" COMPILER_VERSION_INTERNAL_STR "]";
|
||||
#endif
|
||||
|
||||
/* Construct a string literal encoding the version number components. */
|
||||
#ifdef SIMULATE_VERSION_MAJOR
|
||||
char const info_simulate_version[] = {
|
||||
'I', 'N', 'F', 'O', ':',
|
||||
's','i','m','u','l','a','t','e','_','v','e','r','s','i','o','n','[',
|
||||
SIMULATE_VERSION_MAJOR,
|
||||
# ifdef SIMULATE_VERSION_MINOR
|
||||
'.', SIMULATE_VERSION_MINOR,
|
||||
# ifdef SIMULATE_VERSION_PATCH
|
||||
'.', SIMULATE_VERSION_PATCH,
|
||||
# ifdef SIMULATE_VERSION_TWEAK
|
||||
'.', SIMULATE_VERSION_TWEAK,
|
||||
# endif
|
||||
# endif
|
||||
# endif
|
||||
']','\0'};
|
||||
#endif
|
||||
|
||||
/* Construct the string literal in pieces to prevent the source from
|
||||
getting matched. Store it in a pointer rather than an array
|
||||
because some compilers will just produce instructions to fill the
|
||||
array rather than assigning a pointer to a static array. */
|
||||
char const* info_platform = "INFO" ":" "platform[" PLATFORM_ID "]";
|
||||
char const* info_arch = "INFO" ":" "arch[" ARCHITECTURE_ID "]";
|
||||
|
||||
|
||||
|
||||
#define C_STD_99 199901L
|
||||
#define C_STD_11 201112L
|
||||
#define C_STD_17 201710L
|
||||
#define C_STD_23 202311L
|
||||
|
||||
#ifdef __STDC_VERSION__
|
||||
# define C_STD __STDC_VERSION__
|
||||
#endif
|
||||
|
||||
#if !defined(__STDC__) && !defined(__clang__) && !defined(__RENESAS__)
|
||||
# if defined(_MSC_VER) || defined(__ibmxl__) || defined(__IBMC__)
|
||||
# define C_VERSION "90"
|
||||
# else
|
||||
# define C_VERSION
|
||||
# endif
|
||||
#elif C_STD > C_STD_17
|
||||
# define C_VERSION "23"
|
||||
#elif C_STD > C_STD_11
|
||||
# define C_VERSION "17"
|
||||
#elif C_STD > C_STD_99
|
||||
# define C_VERSION "11"
|
||||
#elif C_STD >= C_STD_99
|
||||
# define C_VERSION "99"
|
||||
#else
|
||||
# define C_VERSION "90"
|
||||
#endif
|
||||
const char* info_language_standard_default =
|
||||
"INFO" ":" "standard_default[" C_VERSION "]";
|
||||
|
||||
const char* info_language_extensions_default = "INFO" ":" "extensions_default["
|
||||
#if (defined(__clang__) || defined(__GNUC__) || defined(__xlC__) || \
|
||||
defined(__TI_COMPILER_VERSION__) || defined(__RENESAS__)) && \
|
||||
!defined(__STRICT_ANSI__)
|
||||
"ON"
|
||||
#else
|
||||
"OFF"
|
||||
#endif
|
||||
"]";
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
#ifdef ID_VOID_MAIN
|
||||
void main() {}
|
||||
#else
|
||||
# if defined(__CLASSIC_C__)
|
||||
int main(argc, argv) int argc; char *argv[];
|
||||
# else
|
||||
int main(int argc, char* argv[])
|
||||
# endif
|
||||
{
|
||||
int require = 0;
|
||||
require += info_compiler[argc];
|
||||
require += info_platform[argc];
|
||||
require += info_arch[argc];
|
||||
#ifdef COMPILER_VERSION_MAJOR
|
||||
require += info_version[argc];
|
||||
#endif
|
||||
#if defined(COMPILER_VERSION_INTERNAL) || defined(COMPILER_VERSION_INTERNAL_STR)
|
||||
require += info_version_internal[argc];
|
||||
#endif
|
||||
#ifdef SIMULATE_ID
|
||||
require += info_simulate[argc];
|
||||
#endif
|
||||
#ifdef SIMULATE_VERSION_MAJOR
|
||||
require += info_simulate_version[argc];
|
||||
#endif
|
||||
#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
|
||||
require += info_cray[argc];
|
||||
#endif
|
||||
require += info_language_standard_default[argc];
|
||||
require += info_language_extensions_default[argc];
|
||||
(void)argv;
|
||||
return require;
|
||||
}
|
||||
#endif
|
||||
Binary file not shown.
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1 @@
|
|||
# This file is generated by cmake for dependency checking of the CMakeCache.txt file
|
||||
|
|
@ -1,28 +1,189 @@
|
|||
/*
|
||||
* This file comes from the Kyber repo; see the files in kyber/avx2/test or kyber/ref/test for further details.
|
||||
* pqc-bench harness — cycle-count + optional PAPI hardware counter benchmarks.
|
||||
*
|
||||
* Usage: <binary> [nspins]
|
||||
* nspins number of outer loop-spin iterations (default: 1)
|
||||
*
|
||||
* Each spin runs all operations with NTESTS inner iterations and prints one
|
||||
* median/average pair per operation. With WITH_PAPI, additional lines are
|
||||
* emitted per hardware counter using the same parseable format.
|
||||
*
|
||||
* Build flags:
|
||||
* -DWITH_PAPI link against PAPI and emit hardware counter lines
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include "../kem.h"
|
||||
#include "../params.h"
|
||||
#include "../indcpa.h"
|
||||
#include "../polyvec.h"
|
||||
#include "../poly.h"
|
||||
#include "../randombytes.h"
|
||||
#include <string.h>
|
||||
#include "kem.h"
|
||||
#include "params.h"
|
||||
#include "indcpa.h"
|
||||
#include "polyvec.h"
|
||||
#include "poly.h"
|
||||
#include "randombytes.h"
|
||||
#include "cpucycles.h"
|
||||
#include "speed_print.h"
|
||||
|
||||
#ifdef WITH_PAPI
|
||||
#include <papi.h>
|
||||
#endif
|
||||
|
||||
#define NTESTS 1000
|
||||
|
||||
uint64_t t[NTESTS];
|
||||
uint8_t seed[KYBER_SYMBYTES] = {0};
|
||||
/* ── PAPI instrumentation ───────────────────────────────────────────────── */
|
||||
#ifdef WITH_PAPI
|
||||
|
||||
int main(void)
|
||||
typedef struct {
|
||||
int code;
|
||||
const char *name;
|
||||
} papi_event_def;
|
||||
|
||||
static const papi_event_def DESIRED_EVENTS[] = {
|
||||
{ PAPI_TOT_INS, "instructions" },
|
||||
{ PAPI_L1_DCM, "l1_misses" },
|
||||
{ PAPI_L2_TCM, "l2_misses" },
|
||||
{ PAPI_L3_TCM, "l3_misses" },
|
||||
{ PAPI_BR_MSP, "branch_mispreds" },
|
||||
};
|
||||
#define MAX_EVENTS ((int)(sizeof(DESIRED_EVENTS) / sizeof(DESIRED_EVENTS[0])))
|
||||
|
||||
static int papi_eventset = PAPI_NULL;
|
||||
static int active_codes[MAX_EVENTS];
|
||||
static const char *active_names[MAX_EVENTS];
|
||||
static int n_active = 0;
|
||||
static int papi_ok = 0; /* set to 1 if init succeeded */
|
||||
|
||||
static void papi_init(void) {
|
||||
int ret;
|
||||
|
||||
ret = PAPI_library_init(PAPI_VER_CURRENT);
|
||||
if (ret != PAPI_VER_CURRENT) {
|
||||
fprintf(stderr, "PAPI_library_init: %s — hardware counters disabled\n",
|
||||
PAPI_strerror(ret));
|
||||
return;
|
||||
}
|
||||
|
||||
if ((ret = PAPI_create_eventset(&papi_eventset)) != PAPI_OK) {
|
||||
fprintf(stderr, "PAPI_create_eventset: %s — hardware counters disabled\n",
|
||||
PAPI_strerror(ret));
|
||||
return;
|
||||
}
|
||||
|
||||
for (int i = 0; i < MAX_EVENTS; i++) {
|
||||
if (PAPI_query_event(DESIRED_EVENTS[i].code) != PAPI_OK) {
|
||||
fprintf(stderr, "PAPI: event %s not available on this hardware, skipping\n",
|
||||
DESIRED_EVENTS[i].name);
|
||||
continue;
|
||||
}
|
||||
ret = PAPI_add_event(papi_eventset, DESIRED_EVENTS[i].code);
|
||||
if (ret != PAPI_OK) {
|
||||
fprintf(stderr, "PAPI_add_event(%s): %s — skipping\n",
|
||||
DESIRED_EVENTS[i].name, PAPI_strerror(ret));
|
||||
continue;
|
||||
}
|
||||
active_codes[n_active] = DESIRED_EVENTS[i].code;
|
||||
active_names[n_active] = DESIRED_EVENTS[i].name;
|
||||
n_active++;
|
||||
}
|
||||
|
||||
if (n_active == 0) {
|
||||
fprintf(stderr, "PAPI: no events could be added — hardware counters disabled\n");
|
||||
return;
|
||||
}
|
||||
|
||||
if ((ret = PAPI_start(papi_eventset)) != PAPI_OK) {
|
||||
fprintf(stderr, "PAPI_start: %s — hardware counters disabled\n",
|
||||
PAPI_strerror(ret));
|
||||
return;
|
||||
}
|
||||
|
||||
papi_ok = 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* papi_print — print per-call counter values for one (op, counter) pair.
|
||||
* Both "median" and "average" are set to the same per-call value; the outer
|
||||
* loop-spin structure gives the aggregation tool a real distribution.
|
||||
* The IPC line uses a float value multiplied by 1000 for integer storage;
|
||||
* the analysis tool divides by 1000 to recover IPC.
|
||||
*/
|
||||
static void papi_print(const char *op, const char *counter,
|
||||
long long total, int ntests)
|
||||
{
|
||||
unsigned int i;
|
||||
long long per_call = total / ntests;
|
||||
printf("%s_%s: \nmedian: %lld per_call\naverage: %lld per_call\n\n",
|
||||
op, counter, per_call, per_call);
|
||||
}
|
||||
|
||||
/*
|
||||
* papi_bench — read counters around an already-executed NTESTS block.
|
||||
* Call papi_read_before() immediately before the loop and
|
||||
* papi_bench_report() immediately after.
|
||||
*/
|
||||
static long long _papi_before[MAX_EVENTS];
|
||||
static long long _papi_after[MAX_EVENTS];
|
||||
|
||||
static inline void papi_read_before(void) {
|
||||
if (papi_ok) PAPI_read(papi_eventset, _papi_before);
|
||||
}
|
||||
|
||||
static void papi_bench_report(const char *op) {
|
||||
if (!papi_ok) return;
|
||||
PAPI_read(papi_eventset, _papi_after);
|
||||
for (int e = 0; e < n_active; e++) {
|
||||
long long delta = _papi_after[e] - _papi_before[e];
|
||||
papi_print(op, active_names[e], delta, NTESTS);
|
||||
}
|
||||
}
|
||||
|
||||
#define PAPI_BEFORE() papi_read_before()
|
||||
#define PAPI_AFTER(op) papi_bench_report(op)
|
||||
|
||||
#else /* !WITH_PAPI */
|
||||
|
||||
static inline void papi_init(void) {}
|
||||
#define PAPI_BEFORE() ((void)0)
|
||||
#define PAPI_AFTER(op) ((void)0)
|
||||
|
||||
#endif /* WITH_PAPI */
|
||||
|
||||
/* ── Benchmark helpers ───────────────────────────────────────────────────── */
|
||||
|
||||
/*
|
||||
* BENCH(label, body) — time NTESTS executions of body, print results, then
|
||||
* emit PAPI counter lines if enabled.
|
||||
*/
|
||||
#define BENCH(label, body) \
|
||||
do { \
|
||||
PAPI_BEFORE(); \
|
||||
for (unsigned int _i = 0; _i < NTESTS; _i++) { \
|
||||
t[_i] = cpucycles(); \
|
||||
body; \
|
||||
} \
|
||||
print_results(label ": ", t, NTESTS); \
|
||||
PAPI_AFTER(label); \
|
||||
} while (0)
|
||||
|
||||
/* ── Main ────────────────────────────────────────────────────────────────── */
|
||||
|
||||
static uint64_t t[NTESTS];
|
||||
static uint8_t seed[KYBER_SYMBYTES] = {0};
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int nspins = 1;
|
||||
if (argc > 1) {
|
||||
nspins = atoi(argv[1]);
|
||||
if (nspins <= 0) {
|
||||
fprintf(stderr, "usage: %s [nspins]\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
papi_init();
|
||||
|
||||
uint8_t pk[CRYPTO_PUBLICKEYBYTES];
|
||||
uint8_t sk[CRYPTO_SECRETKEYBYTES];
|
||||
uint8_t ct[CRYPTO_CIPHERTEXTBYTES];
|
||||
|
|
@ -30,130 +191,74 @@ int main(void)
|
|||
uint8_t coins32[KYBER_SYMBYTES];
|
||||
uint8_t coins64[2*KYBER_SYMBYTES];
|
||||
polyvec matrix[KYBER_K];
|
||||
poly ap;
|
||||
poly ap;
|
||||
|
||||
randombytes(coins32, KYBER_SYMBYTES);
|
||||
randombytes(coins64, 2*KYBER_SYMBYTES);
|
||||
|
||||
for(i=0;i<NTESTS;i++) {
|
||||
t[i] = cpucycles();
|
||||
gen_matrix(matrix, seed, 0);
|
||||
}
|
||||
print_results("gen_a: ", t, NTESTS);
|
||||
for (int spin = 1; spin <= nspins; spin++) {
|
||||
printf("Loop spin: %d\n", spin);
|
||||
|
||||
for(i=0;i<NTESTS;i++) {
|
||||
t[i] = cpucycles();
|
||||
poly_getnoise_eta1(&ap, seed, 0);
|
||||
}
|
||||
print_results("poly_getnoise_eta1: ", t, NTESTS);
|
||||
BENCH("gen_a",
|
||||
gen_matrix(matrix, seed, 0));
|
||||
|
||||
for(i=0;i<NTESTS;i++) {
|
||||
t[i] = cpucycles();
|
||||
poly_getnoise_eta2(&ap, seed, 0);
|
||||
}
|
||||
print_results("poly_getnoise_eta2: ", t, NTESTS);
|
||||
BENCH("poly_getnoise_eta1",
|
||||
poly_getnoise_eta1(&ap, seed, 0));
|
||||
|
||||
for(i=0;i<NTESTS;i++) {
|
||||
t[i] = cpucycles();
|
||||
poly_ntt(&ap);
|
||||
}
|
||||
print_results("NTT: ", t, NTESTS);
|
||||
BENCH("poly_getnoise_eta2",
|
||||
poly_getnoise_eta2(&ap, seed, 0));
|
||||
|
||||
for(i=0;i<NTESTS;i++) {
|
||||
t[i] = cpucycles();
|
||||
poly_invntt_tomont(&ap);
|
||||
}
|
||||
print_results("INVNTT: ", t, NTESTS);
|
||||
BENCH("NTT",
|
||||
poly_ntt(&ap));
|
||||
|
||||
for(i=0;i<NTESTS;i++) {
|
||||
t[i] = cpucycles();
|
||||
polyvec_basemul_acc_montgomery(&ap, &matrix[0], &matrix[1]);
|
||||
}
|
||||
print_results("polyvec_basemul_acc_montgomery: ", t, NTESTS);
|
||||
BENCH("INVNTT",
|
||||
poly_invntt_tomont(&ap));
|
||||
|
||||
for(i=0;i<NTESTS;i++) {
|
||||
t[i] = cpucycles();
|
||||
poly_tomsg(ct,&ap);
|
||||
}
|
||||
print_results("poly_tomsg: ", t, NTESTS);
|
||||
BENCH("polyvec_basemul_acc_montgomery",
|
||||
polyvec_basemul_acc_montgomery(&ap, &matrix[0], &matrix[1]));
|
||||
|
||||
for(i=0;i<NTESTS;i++) {
|
||||
t[i] = cpucycles();
|
||||
poly_frommsg(&ap,ct);
|
||||
}
|
||||
print_results("poly_frommsg: ", t, NTESTS);
|
||||
BENCH("poly_tomsg",
|
||||
poly_tomsg(ct, &ap));
|
||||
|
||||
for(i=0;i<NTESTS;i++) {
|
||||
t[i] = cpucycles();
|
||||
poly_compress(ct,&ap);
|
||||
}
|
||||
print_results("poly_compress: ", t, NTESTS);
|
||||
BENCH("poly_frommsg",
|
||||
poly_frommsg(&ap, ct));
|
||||
|
||||
for(i=0;i<NTESTS;i++) {
|
||||
t[i] = cpucycles();
|
||||
poly_decompress(&ap,ct);
|
||||
}
|
||||
print_results("poly_decompress: ", t, NTESTS);
|
||||
BENCH("poly_compress",
|
||||
poly_compress(ct, &ap));
|
||||
|
||||
for(i=0;i<NTESTS;i++) {
|
||||
t[i] = cpucycles();
|
||||
polyvec_compress(ct,&matrix[0]);
|
||||
}
|
||||
print_results("polyvec_compress: ", t, NTESTS);
|
||||
BENCH("poly_decompress",
|
||||
poly_decompress(&ap, ct));
|
||||
|
||||
for(i=0;i<NTESTS;i++) {
|
||||
t[i] = cpucycles();
|
||||
polyvec_decompress(&matrix[0],ct);
|
||||
}
|
||||
print_results("polyvec_decompress: ", t, NTESTS);
|
||||
BENCH("polyvec_compress",
|
||||
polyvec_compress(ct, &matrix[0]));
|
||||
|
||||
for(i=0;i<NTESTS;i++) {
|
||||
t[i] = cpucycles();
|
||||
indcpa_keypair_derand(pk, sk, coins32);
|
||||
}
|
||||
print_results("indcpa_keypair: ", t, NTESTS);
|
||||
BENCH("polyvec_decompress",
|
||||
polyvec_decompress(&matrix[0], ct));
|
||||
|
||||
for(i=0;i<NTESTS;i++) {
|
||||
t[i] = cpucycles();
|
||||
indcpa_enc(ct, key, pk, seed);
|
||||
}
|
||||
print_results("indcpa_enc: ", t, NTESTS);
|
||||
BENCH("indcpa_keypair",
|
||||
indcpa_keypair_derand(pk, sk, coins32));
|
||||
|
||||
for(i=0;i<NTESTS;i++) {
|
||||
t[i] = cpucycles();
|
||||
indcpa_dec(key, ct, sk);
|
||||
}
|
||||
print_results("indcpa_dec: ", t, NTESTS);
|
||||
BENCH("indcpa_enc",
|
||||
indcpa_enc(ct, key, pk, seed));
|
||||
|
||||
for(i=0;i<NTESTS;i++) {
|
||||
t[i] = cpucycles();
|
||||
crypto_kem_keypair_derand(pk, sk, coins64);
|
||||
}
|
||||
print_results("kyber_keypair_derand: ", t, NTESTS);
|
||||
BENCH("indcpa_dec",
|
||||
indcpa_dec(key, ct, sk));
|
||||
|
||||
for(i=0;i<NTESTS;i++) {
|
||||
t[i] = cpucycles();
|
||||
crypto_kem_keypair(pk, sk);
|
||||
}
|
||||
print_results("kyber_keypair: ", t, NTESTS);
|
||||
BENCH("kyber_keypair_derand",
|
||||
crypto_kem_keypair_derand(pk, sk, coins64));
|
||||
|
||||
for(i=0;i<NTESTS;i++) {
|
||||
t[i] = cpucycles();
|
||||
crypto_kem_enc_derand(ct, key, pk, coins32);
|
||||
}
|
||||
print_results("kyber_encaps_derand: ", t, NTESTS);
|
||||
BENCH("kyber_keypair",
|
||||
crypto_kem_keypair(pk, sk));
|
||||
|
||||
for(i=0;i<NTESTS;i++) {
|
||||
t[i] = cpucycles();
|
||||
crypto_kem_enc(ct, key, pk);
|
||||
}
|
||||
print_results("kyber_encaps: ", t, NTESTS);
|
||||
BENCH("kyber_encaps_derand",
|
||||
crypto_kem_enc_derand(ct, key, pk, coins32));
|
||||
|
||||
for(i=0;i<NTESTS;i++) {
|
||||
t[i] = cpucycles();
|
||||
crypto_kem_dec(key, ct, sk);
|
||||
BENCH("kyber_encaps",
|
||||
crypto_kem_enc(ct, key, pk));
|
||||
|
||||
BENCH("kyber_decaps",
|
||||
crypto_kem_dec(key, ct, sk));
|
||||
}
|
||||
print_results("kyber_decaps: ", t, NTESTS);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -0,0 +1,10 @@
|
|||
op,m512,m768,m1024
|
||||
INVNTT,1.000,1.000,1.000
|
||||
basemul,1.000,1.000,1.000
|
||||
frommsg,1.000,1.000,1.000
|
||||
NTT,1.000,1.000,1.000
|
||||
iDec,1.000,1.000,1.000
|
||||
iEnc,1.000,1.000,1.000
|
||||
iKeypair,1.000,1.000,1.000
|
||||
gena,1.000,1.000,1.000
|
||||
noise,1.000,1.000,0.999
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
op,m512_sp,m512_elo,m512_ehi,m768_sp,m768_elo,m768_ehi,m1024_sp,m1024_elo,m1024_ehi
|
||||
frommsg,45.642857142857146,0.0,0.0,49.15384615384615,0.0,0.0,55.38461538461539,0.0,0.0
|
||||
INVNTT,56.26086956521739,0.0,0.0,52.22826086956522,0.0,0.010869565217390686,50.49514563106796,0.009708737864080774,0.0
|
||||
basemul,52.04054054054054,0.0,0.7128841169937061,47.577586206896555,0.0,0.0,41.63333333333333,0.0,0.0
|
||||
NTT,35.526315789473685,0.010526315789476826,2.395032525133054,39.39080459770115,0.44762277951932816,0.0,34.58585858585859,0.010101010101010388,0.3631210059781438
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
op,refnv_sp,refnv_elo,refnv_ehi,ref_sp,ref_elo,ref_ehi,avx2_sp,avx2_elo,avx2_ehi
|
||||
INVNTT,3.6937872667820737,0.0,0.0001923446816691765,3.6923668525283597,0.0,0.0008062243947173364,186.44660194174756,0.0,0.00970873786408788
|
||||
basemul,3.209016393442623,6.209637357201814e-05,0.00012419274714359219,3.4479583666933546,0.00013344008540183694,0.00013344008540183694,143.55,0.005555555555559977,0.005555555555531555
|
||||
frommsg,3.0156494522691704,0.0,0.0,2.676388888888889,0.0,0.0,148.23076923076923,0.0,0.0
|
||||
NTT,3.691742580076403,0.0010845307227014267,0.0002938583602705158,3.6691004672897196,0.001071270209427766,0.0010718961341775746,126.8989898989899,0.0,1.3050917336631755
|
||||
iDec,3.5713012771855714,0.00023570612000023416,0.00015086802895014628,3.690161977834612,0.0005032782539924341,0.00046931032063479705,114.75503711558855,0.0010604453870683983,0.0010604453870541874
|
||||
iEnc,3.084863236932217,0.0001782560024712332,0.00016342197515761825,3.21233254333646,0.00035364887129318845,0.00028601070699840747,30.157900043693072,0.0029733062283590073,0.001753088869445918
|
||||
iKeypair,3.049990457461021,0.00022319698359352103,0.00019792531427453852,3.207066542768769,0.0006512941219742885,0.0005064778000369863,26.020352541412997,0.0025143592087069067,0.0010972674500919766
|
||||
gena,2.6965550354099146,0.000484369799391704,0.00048237643023396615,2.7162479142988416,0.0006808616189104555,0.0007206686696927811,12.97504909321936,0.0031123799730270463,0.0032871286177282855
|
||||
noise,2.977777777777778,0.0,0.0,3.4190382728164868,0.0,0.0033585837650456085,4.070093457943925,0.0,0.0
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
op,refnv_sp,refnv_elo,refnv_ehi,ref_sp,ref_elo,ref_ehi,avx2_sp,avx2_elo,avx2_ehi
|
||||
INVNTT,4.082526315789473,0.0,0.00021052631579010495,3.7465224111282844,0.0,0.00019319938176209916,210.7826086956522,0.0,0.010869565217376476
|
||||
basemul,3.2770963704630787,0.0016397780187453748,0.0024627477733942804,3.3996364580628406,0.0,0.0,176.9189189189189,0.0,2.4235468345057427
|
||||
frommsg,3.0109546165884193,0.0,0.0,3.0109546165884193,0.0,0.0,137.42857142857142,0.0,0.0
|
||||
NTT,3.6866764275256223,0.002157843972798279,0.0010798700725032084,3.7303703703703706,0.0,0.0011056225164107758,132.52631578947367,0.0,8.934358367829702
|
||||
iDec,3.742600033957779,0.0006353440528448218,0.00042368257587099833,3.79609644087256,0.0002753054612747441,0.0002753370710646408,133.0543259557344,0.0020120724346099905,0.0020120724346099905
|
||||
iEnc,3.4432478262438213,0.0002504959891131975,0.00030259771432428195,3.530109117810246,0.00039168308874293345,0.00032646898342836295,35.20992436819775,0.0063094659476519155,0.0011068068622037686
|
||||
iKeypair,3.1751089014071656,9.92090538622925e-05,0.00021725496542801537,3.351041039836322,0.00032261099326946763,0.0003142150864068327,27.8438,0.005767606478706,0.005769913982796027
|
||||
gena,2.716878579054644,0.00065187098010977,0.0003882364359895085,2.743237945903567,0.0002940023520188184,0.00046488659667787147,12.781735159817352,0.001369863013698236,0.001369863013698236
|
||||
noise,3.1366495140080044,0.0017923711508616158,0.0,3.433041301627034,0.0,0.0006257822277846437,4.766290182450043,0.0,0.0041446001586527
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
op,refnv_sp,refnv_elo,refnv_ehi,ref_sp,ref_elo,ref_ehi,avx2_sp,avx2_elo,avx2_ehi
|
||||
INVNTT,3.9386252045826513,0.00020458265139122744,0.00020458265139122744,4.006659729448491,0.0008336786786200534,0.00020811654526564638,209.2608695652174,0.010869565217404897,0.010869565217376476
|
||||
basemul,3.306184521797905,0.02605040612313525,0.002795691291897384,3.545207465120493,0.0,0.0,168.67241379310346,0.0,0.0
|
||||
frommsg,2.6708333333333334,0.0,0.0,3.0093896713615025,0.0,0.0,147.92307692307693,0.0,0.0
|
||||
NTT,3.6989152741131632,0.0010840900568913625,0.0,3.681645754304056,0.0,0.0,145.02298850574712,1.6479885057471222,0.0
|
||||
iDec,3.6437147040368125,0.00019424892094210833,0.0003467108483481418,3.800139609964661,0.0003315569175033062,0.00016580015750289334,132.98167938931297,0.001526717557254642,0.003053435114509284
|
||||
iEnc,3.3056977990451344,0.00017231513226034778,0.00016363191105694952,3.48133030817818,0.00022700732330438456,0.00021029337701561346,32.81504567436862,0.004063512322623808,0.0006448146157964629
|
||||
iKeypair,3.109574915272049,0.00020791977755951763,0.00025167432332651174,3.2525126922733425,0.00022163529575136565,0.000286955967172986,24.668559816590246,0.0031435406706883384,0.0007294706127538575
|
||||
gena,2.7088029828997557,0.0007052965244342957,0.0005931348088656918,2.69161485393067,0.0005617516864933059,0.0005061000727368814,10.337667648020936,0.002917034774819527,0.0013902518809292275
|
||||
noise,3.0886524822695036,0.0,0.0008865248226950229,3.4156862745098038,0.0,0.0009803921568627416,4.639147802929427,0.0,0.0013315579227697327
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
op,m512_sp,m512_elo,m512_ehi,m768_sp,m768_elo,m768_ehi,m1024_sp,m1024_elo,m1024_ehi
|
||||
INVNTT,56.26086956521739,0.0,0.0,52.22826086956522,0.0,0.010869565217390686,50.49514563106796,0.009708737864080774,0.0
|
||||
basemul,52.04054054054054,0.0,0.7128841169937061,47.577586206896555,0.0,0.0,41.63333333333333,0.0,0.0
|
||||
frommsg,45.642857142857146,0.0,0.0,49.15384615384615,0.0,0.0,55.38461538461539,0.0,0.0
|
||||
NTT,35.526315789473685,0.010526315789476826,2.395032525133054,39.39080459770115,0.44762277951932816,0.0,34.58585858585859,0.010101010101010388,0.3631210059781438
|
||||
iDec,35.05030181086519,0.0020120724346099905,0.002012072434602885,34.993893129770996,0.001526717557254642,0.0030534351145021787,31.097560975609756,0.0037115588547180778,0.004241781548248724
|
||||
iEnc,9.974174506548607,0.0014707072125688114,0.0011068068622019922,9.426007522837184,0.0013889971548284308,0.0005373455131660876,9.38816253823144,0.001122140301749397,0.001223049292088163
|
||||
iKeypair,8.309,0.0020613877224544552,0.0018621724344871637,7.584462275948312,0.0012591916511350831,0.0003647353063778169,8.113443296049837,0.0015653318677752992,0.0014866204162533592
|
||||
gena,4.659360730593607,0.00045662100456667076,0.0004566210045657826,3.8406934903500165,0.0009551420262225996,0.0004906771344455052,4.776828000462054,0.0014497812681515398,0.0015659914501355843
|
||||
noise,1.3883579496090357,0.0,0.0012072677822687616,1.3581890812250332,0.0,0.0,1.1904205607476634,0.001168224299065379,0.0
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
op,m512_sp,m512_elo,m512_ehi,m768_sp,m768_elo,m768_ehi,m1024_sp,m1024_elo,m1024_ehi
|
||||
KeyGen,5.351663635391034,0.003951776171514432,0.0036136071694450322,5.515256061277458,0.0010128505412421163,0.0011711084383110304,5.92988426026269,0.009300851394026033,0.008673806818412011
|
||||
Encaps,5.976169109582211,0.0057508565558670455,0.00541865850737544,6.159967741935484,0.0016760536843927198,0.0019668260454155373,6.374312588912245,0.007289526521085499,0.0062883831365772025
|
||||
Decaps,7.12829219051115,0.0038254678112616958,0.002336315747572648,7.078920782076425,0.0017374106397927136,0.001435830107824998,6.920672062603092,0.007041626152989089,0.00611276112038972
|
||||
|
Binary file not shown.
|
|
@ -0,0 +1,30 @@
|
|||
% Figure: cross-param speedup consistency for per-polynomial operations.
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[
|
||||
pqc bar,
|
||||
ybar, ymin=0, ymax=70, ytick distance=10,
|
||||
bar width=6pt,
|
||||
width=\columnwidth, height=5cm,
|
||||
symbolic x coords={frommsg,INVNTT,basemul,NTT},
|
||||
ylabel={Speedup \varref{} $\to$ \varavx{} ($\times$)},
|
||||
legend entries={\mlkemk{512}, \mlkemk{768}, \mlkemk{1024}},
|
||||
legend style={at={(0.99,0.99)}, anchor=north east, font=\small},
|
||||
]
|
||||
|
||||
\addplot+[fill=colM512, draw=colM512!70!black, opacity=0.88,
|
||||
error bars/.cd, y dir=both, y explicit]
|
||||
table[x=op, y=m512_sp, y error plus=m512_ehi, y error minus=m512_elo,
|
||||
col sep=comma]{figures/data/cross_param.csv};
|
||||
|
||||
\addplot+[fill=colM768, draw=colM768!70!black, opacity=0.88,
|
||||
error bars/.cd, y dir=both, y explicit]
|
||||
table[x=op, y=m768_sp, y error plus=m768_ehi, y error minus=m768_elo,
|
||||
col sep=comma]{figures/data/cross_param.csv};
|
||||
|
||||
\addplot+[fill=colM1024, draw=colM1024!70!black, opacity=0.88,
|
||||
error bars/.cd, y dir=both, y explicit]
|
||||
table[x=op, y=m1024_sp, y error plus=m1024_ehi, y error minus=m1024_elo,
|
||||
col sep=comma]{figures/data/cross_param.csv};
|
||||
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
|
|
@ -0,0 +1,74 @@
|
|||
% Figure: speedup decomposition — three panels (one per algorithm), log y-axis.
|
||||
% Data: paper/figures/data/decomp_{mlkem512,768,1024}.csv
|
||||
\begin{tikzpicture}
|
||||
\begin{groupplot}[
|
||||
group style={group size=3 by 1, horizontal sep=1.6cm, ylabels at=edge left},
|
||||
pqc bar,
|
||||
ybar, ymode=log, ymin=1, ymax=500,
|
||||
ytick={1,2,5,10,20,50,100,200},
|
||||
yticklabels={$1\times$,$2\times$,$5\times$,$10\times$,$20\times$,$50\times$,$100\times$,$200\times$},
|
||||
yminorticks=true,
|
||||
width=5.2cm, height=6.5cm,
|
||||
symbolic x coords={INVNTT,basemul,frommsg,NTT,iDec,iEnc,iKeypair,gena,noise},
|
||||
xticklabels={INVNTT,basemul,frommsg,NTT,iDec,iEnc,iKeypair,gen\_a,noise},
|
||||
ylabel={Speedup over \texttt{-O0} ($\times$)},
|
||||
]
|
||||
|
||||
%% ML-KEM-512
|
||||
\nextgroupplot[title={\mlkemk{512}}, bar width=3.5pt]
|
||||
|
||||
\addplot+[fill=colRefnv, draw=colRefnv!70!black, opacity=0.85,
|
||||
error bars/.cd, y dir=both, y explicit]
|
||||
table[x=op, y=refnv_sp, y error plus=refnv_ehi, y error minus=refnv_elo,
|
||||
col sep=comma]{figures/data/decomp_mlkem512.csv};
|
||||
|
||||
\addplot+[fill=colRef, draw=colRef!70!black, opacity=0.85,
|
||||
error bars/.cd, y dir=both, y explicit]
|
||||
table[x=op, y=ref_sp, y error plus=ref_ehi, y error minus=ref_elo,
|
||||
col sep=comma]{figures/data/decomp_mlkem512.csv};
|
||||
|
||||
\addplot+[fill=colAvx, draw=colAvx!70!black, opacity=0.85,
|
||||
error bars/.cd, y dir=both, y explicit]
|
||||
table[x=op, y=avx2_sp, y error plus=avx2_ehi, y error minus=avx2_elo,
|
||||
col sep=comma]{figures/data/decomp_mlkem512.csv};
|
||||
|
||||
%% ML-KEM-768
|
||||
\nextgroupplot[title={\mlkemk{768}}, ylabel={}, bar width=3.5pt]
|
||||
|
||||
\addplot+[fill=colRefnv, draw=colRefnv!70!black, opacity=0.85,
|
||||
error bars/.cd, y dir=both, y explicit]
|
||||
table[x=op, y=refnv_sp, y error plus=refnv_ehi, y error minus=refnv_elo,
|
||||
col sep=comma]{figures/data/decomp_mlkem768.csv};
|
||||
|
||||
\addplot+[fill=colRef, draw=colRef!70!black, opacity=0.85,
|
||||
error bars/.cd, y dir=both, y explicit]
|
||||
table[x=op, y=ref_sp, y error plus=ref_ehi, y error minus=ref_elo,
|
||||
col sep=comma]{figures/data/decomp_mlkem768.csv};
|
||||
|
||||
\addplot+[fill=colAvx, draw=colAvx!70!black, opacity=0.85,
|
||||
error bars/.cd, y dir=both, y explicit]
|
||||
table[x=op, y=avx2_sp, y error plus=avx2_ehi, y error minus=avx2_elo,
|
||||
col sep=comma]{figures/data/decomp_mlkem768.csv};
|
||||
|
||||
%% ML-KEM-1024
|
||||
\nextgroupplot[title={\mlkemk{1024}}, ylabel={}, bar width=3.5pt,
|
||||
legend style={at={(1.0,0.99)}, anchor=north east, font=\scriptsize},
|
||||
legend entries={O3 (no auto-vec), O3 + auto-vec, O3 + hand SIMD}]
|
||||
|
||||
\addplot+[fill=colRefnv, draw=colRefnv!70!black, opacity=0.85,
|
||||
error bars/.cd, y dir=both, y explicit]
|
||||
table[x=op, y=refnv_sp, y error plus=refnv_ehi, y error minus=refnv_elo,
|
||||
col sep=comma]{figures/data/decomp_mlkem1024.csv};
|
||||
|
||||
\addplot+[fill=colRef, draw=colRef!70!black, opacity=0.85,
|
||||
error bars/.cd, y dir=both, y explicit]
|
||||
table[x=op, y=ref_sp, y error plus=ref_ehi, y error minus=ref_elo,
|
||||
col sep=comma]{figures/data/decomp_mlkem1024.csv};
|
||||
|
||||
\addplot+[fill=colAvx, draw=colAvx!70!black, opacity=0.85,
|
||||
error bars/.cd, y dir=both, y explicit]
|
||||
table[x=op, y=avx2_sp, y error plus=avx2_ehi, y error minus=avx2_elo,
|
||||
col sep=comma]{figures/data/decomp_mlkem1024.csv};
|
||||
|
||||
\end{groupplot}
|
||||
\end{tikzpicture}
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
% Figure: hand-SIMD speedup (ref->avx2), three algorithms overlaid, log y-axis.
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[
|
||||
pqc bar,
|
||||
ybar, ymode=log, ymin=1, ymax=100,
|
||||
ytick={1,2,5,10,20,50},
|
||||
yticklabels={$1\times$,$2\times$,$5\times$,$10\times$,$20\times$,$50\times$},
|
||||
yminorticks=true,
|
||||
bar width=5pt,
|
||||
width=\textwidth, height=6cm,
|
||||
symbolic x coords={INVNTT,basemul,frommsg,NTT,iDec,iEnc,iKeypair,gena,noise},
|
||||
xticklabels={INVNTT,basemul,frommsg,NTT,iDec,iEnc,iKeypair,gen\_a,noise},
|
||||
ylabel={Speedup \varref{} $\to$ \varavx{} ($\times$)},
|
||||
legend entries={\mlkemk{512}, \mlkemk{768}, \mlkemk{1024}},
|
||||
legend style={at={(0.01,0.99)}, anchor=north west, font=\small},
|
||||
]
|
||||
|
||||
\addplot+[fill=colM512, draw=colM512!70!black, opacity=0.88,
|
||||
error bars/.cd, y dir=both, y explicit]
|
||||
table[x=op, y=m512_sp, y error plus=m512_ehi, y error minus=m512_elo,
|
||||
col sep=comma]{figures/data/hand_simd.csv};
|
||||
|
||||
\addplot+[fill=colM768, draw=colM768!70!black, opacity=0.88,
|
||||
error bars/.cd, y dir=both, y explicit]
|
||||
table[x=op, y=m768_sp, y error plus=m768_ehi, y error minus=m768_elo,
|
||||
col sep=comma]{figures/data/hand_simd.csv};
|
||||
|
||||
\addplot+[fill=colM1024, draw=colM1024!70!black, opacity=0.88,
|
||||
error bars/.cd, y dir=both, y explicit]
|
||||
table[x=op, y=m1024_sp, y error plus=m1024_ehi, y error minus=m1024_elo,
|
||||
col sep=comma]{figures/data/hand_simd.csv};
|
||||
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
% Figure: KEM-level end-to-end speedup (supplementary).
|
||||
\begin{tikzpicture}
|
||||
\begin{axis}[
|
||||
pqc bar,
|
||||
ybar, ymin=0, ymax=9, ytick distance=1,
|
||||
bar width=8pt,
|
||||
width=\columnwidth, height=5cm,
|
||||
symbolic x coords={KeyGen,Encaps,Decaps},
|
||||
ylabel={Speedup \varref{} $\to$ \varavx{} ($\times$)},
|
||||
legend entries={\mlkemk{512}, \mlkemk{768}, \mlkemk{1024}},
|
||||
legend style={at={(0.01,0.99)}, anchor=north west, font=\small},
|
||||
]
|
||||
|
||||
\addplot+[fill=colM512, draw=colM512!70!black, opacity=0.88,
|
||||
error bars/.cd, y dir=both, y explicit]
|
||||
table[x=op, y=m512_sp, y error plus=m512_ehi, y error minus=m512_elo,
|
||||
col sep=comma]{figures/data/kem_level.csv};
|
||||
|
||||
\addplot+[fill=colM768, draw=colM768!70!black, opacity=0.88,
|
||||
error bars/.cd, y dir=both, y explicit]
|
||||
table[x=op, y=m768_sp, y error plus=m768_ehi, y error minus=m768_elo,
|
||||
col sep=comma]{figures/data/kem_level.csv};
|
||||
|
||||
\addplot+[fill=colM1024, draw=colM1024!70!black, opacity=0.88,
|
||||
error bars/.cd, y dir=both, y explicit]
|
||||
table[x=op, y=m1024_sp, y error plus=m1024_ehi, y error minus=m1024_elo,
|
||||
col sep=comma]{figures/data/kem_level.csv};
|
||||
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
||||
|
|
@ -0,0 +1,47 @@
|
|||
% ── Shared macros ─────────────────────────────────────────────────────────────
|
||||
|
||||
% Algorithm shorthands
|
||||
\newcommand{\mlkem}{ML-KEM}
|
||||
\newcommand{\mlkemk}[1]{ML-KEM-#1}
|
||||
\newcommand{\mldsa}{ML-DSA}
|
||||
\newcommand{\slhdsa}{SLH-DSA}
|
||||
|
||||
% Variant names (monospace)
|
||||
\newcommand{\varref}{\texttt{ref}}
|
||||
\newcommand{\varrefnv}{\texttt{refnv}}
|
||||
\newcommand{\varrefo}{\texttt{refo0}}
|
||||
\newcommand{\varavx}{\texttt{avx2}}
|
||||
|
||||
% Operation shorthand
|
||||
\newcommand{\op}[1]{\texttt{#1}}
|
||||
|
||||
% Speedup formatting: \speedup{45.6}
|
||||
\newcommand{\speedup}[1]{$#1\times$}
|
||||
|
||||
% Phase 2 / future-work placeholder
|
||||
\newcommand{\phasetwo}[1]{\todo[color=blue!15,caption={Phase 2: #1}]{Phase~2: #1}}
|
||||
\newcommand{\phasethree}[1]{\todo[color=green!15,caption={Phase 3: #1}]{Phase~3: #1}}
|
||||
|
||||
% pgfplots colors (match matplotlib palette)
|
||||
\definecolor{colRefnv}{HTML}{4C72B0} % blue
|
||||
\definecolor{colRef}{HTML}{55A868} % green
|
||||
\definecolor{colAvx}{HTML}{C44E52} % red
|
||||
\definecolor{colM512}{HTML}{4C72B0}
|
||||
\definecolor{colM768}{HTML}{55A868}
|
||||
\definecolor{colM1024}{HTML}{C44E52}
|
||||
|
||||
% Shared pgfplots style.
|
||||
% NOTE: ybar, ymode=log, and bar width CANNOT be used inside \pgfplotsset styles
|
||||
% due to a pgfkeys namespace issue; apply them inline in each axis instead.
|
||||
\pgfplotsset{
|
||||
pqc bar/.style={
|
||||
ymajorgrids=true,
|
||||
yminorgrids=true,
|
||||
grid style={dashed, gray!30},
|
||||
xtick=data,
|
||||
x tick label style={rotate=45, anchor=east, font=\small},
|
||||
legend style={font=\small, at={(0.99,0.99)}, anchor=north east},
|
||||
error bars/error bar style={line width=0.5pt},
|
||||
error bars/error mark options={rotate=90, mark size=1.5pt},
|
||||
},
|
||||
}
|
||||
114
paper/main.tex
114
paper/main.tex
|
|
@ -1,13 +1,22 @@
|
|||
\documentclass[sigconf, nonacm]{acmart}
|
||||
|
||||
% ── Packages ──────────────────────────────────────────────────────────────────
|
||||
\usepackage{booktabs}
|
||||
\usepackage{microtype}
|
||||
\usepackage{subcaption}
|
||||
\usepackage{todonotes}
|
||||
\usepackage{pgfplots}
|
||||
\usepackage{pgfplotstable}
|
||||
\usepgfplotslibrary{groupplots}
|
||||
\pgfplotsset{compat=1.18}
|
||||
|
||||
% ── Metadata (fill in when ready) ────────────────────────────────────────────
|
||||
\title{SIMD Optimization in Post-Quantum Cryptography:\\
|
||||
A Micro-Architecture and Energy Analysis}
|
||||
\input{macros}
|
||||
|
||||
% ── Metadata ──────────────────────────────────────────────────────────────────
|
||||
% NOTE: Title targets Phase 1 (ML-KEM, x86 AVX2).
|
||||
% Update when Phase 2/3 material (ML-DSA, ARM, energy) is incorporated.
|
||||
\title{Where Does SIMD Help Post-Quantum Cryptography?\\
|
||||
A Micro-Architectural Study of ML-KEM on x86 AVX2}
|
||||
|
||||
\author{Levi Neuwirth}
|
||||
\affiliation{%
|
||||
|
|
@ -18,103 +27,30 @@
|
|||
}
|
||||
\email{ln@levineuwirth.org}
|
||||
|
||||
% ── Abstract ──────────────────────────────────────────────────────────────────
|
||||
\begin{abstract}
|
||||
TODO
|
||||
\input{sections/abstract}
|
||||
\end{abstract}
|
||||
|
||||
\keywords{post-quantum cryptography, ML-KEM, Kyber, SIMD, AVX2, performance
|
||||
analysis, energy efficiency, micro-architecture}
|
||||
analysis, micro-architecture, benchmark reproducibility}
|
||||
|
||||
% ─────────────────────────────────────────────────────────────────────────────
|
||||
\begin{document}
|
||||
\maketitle
|
||||
|
||||
% ── 1. Introduction ──────────────────────────────────────────────────────────
|
||||
\section{Introduction}
|
||||
\label{sec:intro}
|
||||
\input{sections/intro}
|
||||
\input{sections/background}
|
||||
\input{sections/methodology}
|
||||
\input{sections/results}
|
||||
\input{sections/discussion}
|
||||
\input{sections/related}
|
||||
\input{sections/conclusion}
|
||||
|
||||
TODO
|
||||
|
||||
% ── 2. Background ────────────────────────────────────────────────────────────
|
||||
\section{Background}
|
||||
\label{sec:background}
|
||||
|
||||
\subsection{ML-KEM / Kyber}
|
||||
TODO: Module-LWE, ring structure, NTT.
|
||||
|
||||
\subsection{SIMD on x86-64}
|
||||
TODO: AVX2 register model, relevant instructions for polynomial arithmetic.
|
||||
|
||||
\subsection{Hardware Performance Counters and RAPL}
|
||||
TODO: perf, PAPI, Intel RAPL energy domains.
|
||||
|
||||
% ── 3. Methodology ───────────────────────────────────────────────────────────
|
||||
\section{Methodology}
|
||||
\label{sec:methodology}
|
||||
|
||||
\subsection{Implementation Variants}
|
||||
TODO: ref (AVX2 intrinsics), refnv (scalar, no vectorization), refo0 (unoptimized
|
||||
baseline).
|
||||
|
||||
\subsection{Benchmark Harness}
|
||||
TODO: cycle counter, iteration count, statistical methodology, OSCAR node spec.
|
||||
|
||||
\subsection{Hardware Counter Collection}
|
||||
TODO: PAPI events selected and why.
|
||||
|
||||
\subsection{Energy Measurement}
|
||||
TODO: RAPL pkg + DRAM domains, joules-per-operation derivation.
|
||||
|
||||
% ── 4. Results ───────────────────────────────────────────────────────────────
|
||||
\section{Results}
|
||||
\label{sec:results}
|
||||
|
||||
\subsection{Cycle Counts}
|
||||
|
||||
\begin{table}[h]
|
||||
\caption{Median cycle counts, ML-KEM-512, 10\,000 iterations.}
|
||||
\label{tab:cycles512}
|
||||
\begin{tabular}{lrrr}
|
||||
\toprule
|
||||
Operation & ref (AVX2) & refnv (scalar) & speedup \\
|
||||
\midrule
|
||||
NTT & TODO & TODO & TODO$\times$ \\
|
||||
INVNTT & TODO & TODO & TODO$\times$ \\
|
||||
polyvec\_basemul\_acc & TODO & TODO & TODO$\times$ \\
|
||||
indcpa\_keypair & TODO & TODO & TODO$\times$ \\
|
||||
indcpa\_enc & TODO & TODO & TODO$\times$ \\
|
||||
kyber\_encaps & TODO & TODO & TODO$\times$ \\
|
||||
kyber\_decaps & TODO & TODO & TODO$\times$ \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
\subsection{Hardware Counter Breakdown}
|
||||
TODO: IPC, cache miss rates, branch mispredictions.
|
||||
|
||||
\subsection{Energy Efficiency}
|
||||
TODO: joules/operation, EDP comparison.
|
||||
|
||||
% ── 5. Discussion ────────────────────────────────────────────────────────────
|
||||
\section{Discussion}
|
||||
\label{sec:discussion}
|
||||
|
||||
TODO: mechanistic explanation of where the speedup comes from.
|
||||
|
||||
% ── 6. Related Work ──────────────────────────────────────────────────────────
|
||||
\section{Related Work}
|
||||
\label{sec:related}
|
||||
|
||||
TODO
|
||||
|
||||
% ── 7. Conclusion ────────────────────────────────────────────────────────────
|
||||
\section{Conclusion}
|
||||
\label{sec:conclusion}
|
||||
|
||||
TODO
|
||||
|
||||
% ── References ───────────────────────────────────────────────────────────────
|
||||
\bibliographystyle{ACM-Reference-Format}
|
||||
\bibliography{refs}
|
||||
|
||||
\appendix
|
||||
\input{sections/supplementary}
|
||||
|
||||
\end{document}
|
||||
|
|
|
|||
|
|
@ -42,7 +42,7 @@
|
|||
|
||||
@misc{kyber-avx2,
|
||||
author = {Schwabe, Peter and Seiler, Gregor},
|
||||
title = {{Better Bootstrapping in Fully Homomorphic Encryption}},
|
||||
title = {{High-Speed {AVX2} Implementation of the {Kyber} Key Encapsulation Mechanism}},
|
||||
note = {AVX2 implementation in the pqclean project},
|
||||
url = {https://github.com/pq-crystals/kyber},
|
||||
}
|
||||
|
|
@ -97,3 +97,45 @@
|
|||
title = {{pqm4: Post-quantum crypto library for the ARM Cortex-M4}},
|
||||
url = {https://github.com/mupq/pqm4},
|
||||
}
|
||||
|
||||
@misc{supercop,
|
||||
author = {Bernstein, Daniel J. and Lange, Tanja},
|
||||
title = {{SUPERCOP: System for Unified Performance Evaluation Related to
|
||||
Cryptographic Operations and Primitives}},
|
||||
url = {https://bench.cr.yp.to/supercop.html},
|
||||
}
|
||||
|
||||
@misc{papi,
|
||||
author = {{Innovative Computing Laboratory, University of Tennessee}},
|
||||
title = {{PAPI: Performance Application Programming Interface}},
|
||||
url = {https://icl.utk.edu/papi/},
|
||||
}
|
||||
|
||||
@inproceedings{gueron2014,
|
||||
author = {Gueron, Shay and Krasnov, Vlad},
|
||||
title = {{Fast Garbling of Circuits Under Standard Assumptions}},
|
||||
booktitle = {ACM CCS},
|
||||
year = {2013},
|
||||
note = {See also: Intel white paper on AES-GCM with AVX2},
|
||||
}
|
||||
|
||||
@misc{bernstein2006,
|
||||
author = {Bernstein, Daniel J.},
|
||||
title = {{Curve25519: new Diffie-Hellman speed records}},
|
||||
year = {2006},
|
||||
url = {https://cr.yp.to/ecdh.html},
|
||||
}
|
||||
|
||||
@misc{cachetime,
|
||||
author = {Bernstein, Daniel J. and Schwabe, Peter},
|
||||
title = {{New AES Software Speed Records}},
|
||||
year = {2008},
|
||||
url = {https://cr.yp.to/aes-speed.html},
|
||||
}
|
||||
|
||||
@misc{bettini2024,
|
||||
author = {{Google Security Blog}},
|
||||
title = {{Protecting Chrome Traffic with Hybrid Kyber KEM}},
|
||||
year = {2023},
|
||||
url = {https://security.googleblog.com/2023/08/protecting-chrome-traffic-with-hybrid.html},
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,31 @@
|
|||
Post-quantum cryptography (PQC) standards are being deployed at scale following
|
||||
NIST's 2024 finalization of \mlkem{} (FIPS~203), \mldsa{} (FIPS~204), and
|
||||
\slhdsa{} (FIPS~205). Hand-written SIMD implementations of these algorithms
|
||||
report dramatic performance advantages, yet the mechanistic origins of these
|
||||
speedups are rarely quantified with statistical rigor.
|
||||
|
||||
We present the first systematic empirical decomposition of SIMD speedup across
|
||||
the operations of \mlkem{} (Kyber) on Intel x86-64 with AVX2. Using a
|
||||
reproducible benchmark harness across four compilation variants---\varrefo{}
|
||||
(unoptimized), \varrefnv{} (O3, auto-vectorization disabled), \varref{}
|
||||
(O3 with auto-vectorization), and \varavx{} (hand-written AVX2 intrinsics)---we
|
||||
isolate three distinct contributions: compiler optimization, compiler
|
||||
auto-vectorization, and hand-written SIMD. All measurements are conducted on a
|
||||
pinned core of an Intel Xeon Platinum 8268 on Brown University's OSCAR HPC
|
||||
cluster, with statistical significance assessed via Mann-Whitney U tests and
|
||||
Cliff's~$\delta$ effect-size analysis across $n \ge 2{,}000$ independent
|
||||
observations per group.
|
||||
|
||||
Our key findings are: (1) hand-written AVX2 assembly accounts for a
\speedup{35}--\speedup{56} speedup over compiler-optimized C for the dominant
|
||||
arithmetic operations (NTT, INVNTT, base multiplication), with Cliff's
|
||||
$\delta = +1.000$ in every comparison---meaning AVX2 is faster in
|
||||
\emph{every single} observation pair; (2) GCC's auto-vectorizer contributes
|
||||
negligibly or even slightly negatively for NTT-based operations because the
|
||||
modular reduction step prevents vectorization; (3) end-to-end KEM speedups of
|
||||
\speedup{5.4}--\speedup{7.1} result from a weighted combination of large
|
||||
per-operation gains and smaller gains in SHAKE-heavy operations (gen\_a:
|
||||
\speedup{3.8}--\speedup{4.7}; noise sampling: \speedup{1.2}--\speedup{1.4}).
|
||||
|
||||
The benchmark harness, raw data, and analysis pipeline are released as an open
|
||||
reproducible artifact.
|
||||
|
|
@ -0,0 +1,88 @@
|
|||
% ── 2. Background ─────────────────────────────────────────────────────────────
|
||||
\section{Background}
|
||||
\label{sec:background}
|
||||
|
||||
\subsection{ML-KEM and the Number Theoretic Transform}
|
||||
|
||||
\mlkem{}~\cite{fips203} is a key encapsulation mechanism built on the
|
||||
Module-Learning-With-Errors (Module-LWE) problem. Its security parameter
|
||||
$k \in \{2, 3, 4\}$ controls the module dimension, yielding the three
|
||||
instantiations \mlkemk{512}, \mlkemk{768}, and \mlkemk{1024}. The scheme
|
||||
operates on polynomials in $\mathbb{Z}_q[x]/(x^{256}+1)$ with $q = 3329$.
|
||||
|
||||
The computational core is polynomial multiplication, which \mlkem{} evaluates
|
||||
using the Number Theoretic Transform (NTT)~\cite{ntt-survey}. The NTT is a
|
||||
modular analog of the Fast Fourier Transform that reduces schoolbook
|
||||
$O(n^2)$ polynomial multiplication to $O(n \log n)$ pointwise operations.
|
||||
For $n = 256$ coefficients and $q = 3329$, \mlkem{} uses an incomplete
seven-layer radix-2 Cooley--Tukey NTT that maps each polynomial to 128
degree-one residue polynomials; products are then formed by pointwise
multiplication of these degree-one factors in the NTT domain.
|
||||
|
||||
The primitive operations benchmarked in this paper are:
|
||||
\begin{itemize}
|
||||
\item \op{NTT} / \op{INVNTT}: forward and inverse NTT over a single
|
||||
polynomial ($n = 256$).
|
||||
\item \op{basemul}: pointwise multiplication in the NTT domain (base
|
||||
multiplication of two NTT-domain polynomials).
|
||||
\item \op{poly\_frommsg}: encodes a 32-byte message into a polynomial.
|
||||
\item \op{gen\_a}: generates the public matrix $\mathbf{A}$ by expanding
|
||||
a seed with SHAKE-128.
|
||||
\item \op{poly\_getnoise\_eta\{1,2\}}: samples a centered binomial
|
||||
distribution (CBD) noise polynomial using SHAKE-256 output.
|
||||
\item \op{indcpa\_\{keypair, enc, dec\}}: full IND-CPA key generation,
|
||||
encryption, and decryption.
|
||||
\end{itemize}
|
||||
|
||||
\subsection{AVX2 SIMD on x86-64}
|
||||
|
||||
Intel's Advanced Vector Extensions 2 (AVX2) extends the YMM register file to
|
||||
256-bit width, accommodating sixteen 16-bit integers simultaneously. The
|
||||
\mlkem{} AVX2 implementation~\cite{kyber-avx2} by Schwabe and Seiler uses
hand-written assembly and AVX2 intrinsics rather than relying on
compiler-generated vectorized code.
|
||||
|
||||
The key instruction patterns exploited are:
|
||||
\begin{itemize}
|
||||
\item \texttt{vpaddw} / \texttt{vpsubw}: packed 16-bit addition/subtraction,
|
||||
operating on 16 coefficients per instruction.
|
||||
\item \texttt{vpmullw} / \texttt{vpmulhw}: packed 16-bit low/high multiply,
|
||||
used to implement 16-wide Montgomery reduction.
|
||||
\item \texttt{vpunpcklwd} / \texttt{vpunpckhwd}: interleave operations for
|
||||
the NTT butterfly shuffle pattern.
|
||||
\end{itemize}
|
||||
|
||||
Because \mlkem{} coefficients are 16-bit integers and the NTT butterfly
|
||||
operates independently on 16 coefficient pairs per round, AVX2 offers a
|
||||
theoretical $16\times$ instruction-count reduction for arithmetic steps. As
|
||||
\S\ref{sec:results} shows, observed speedups \emph{exceed} $16\times$ for
|
||||
\op{INVNTT} and \op{basemul} due to additional instruction-level parallelism
|
||||
(ILP) in the unrolled hand-written loops.
|
||||
|
||||
\subsection{Compilation Variants}
|
||||
|
||||
To isolate distinct sources of speedup, we define four compilation variants
|
||||
(detailed in §\ref{sec:methodology}):
|
||||
|
||||
\begin{description}
|
||||
\item[\varrefo{}] Compiled at \texttt{-O0}: no optimization. Serves as the
|
||||
unoptimized baseline.
|
||||
\item[\varrefnv{}] Compiled at \texttt{-O3 -fno-tree-vectorize}: full
|
||||
compiler optimization but with auto-vectorization disabled. Isolates
|
||||
the contribution of general compiler optimizations (register
|
||||
allocation, loop unrolling, constant propagation) from SIMD.
|
||||
\item[\varref{}] Compiled at \texttt{-O3}: full optimization including GCC's
|
||||
auto-vectorizer. Represents what production deployments without
|
||||
hand-tuned SIMD would achieve.
|
||||
\item[\varavx{}] Hand-written AVX2 assembly: the production-quality
|
||||
optimized implementation.
|
||||
\end{description}
|
||||
|
||||
\subsection{Hardware Performance Counters and Energy}
|
||||
\label{sec:bg:papi}
|
||||
\phasetwo{Expand with PAPI and RAPL background once data is collected.}
|
||||
|
||||
Hardware performance counters (accessed via PAPI~\cite{papi} or Linux
|
||||
\texttt{perf\_event}) allow measuring IPC, cache miss rates, and branch
|
||||
mispredictions at the instruction level. Intel RAPL~\cite{rapl} provides
|
||||
package- and DRAM-domain energy readings. These will be incorporated in
|
||||
Phase~2 to provide a mechanistic hardware-level explanation complementing the
|
||||
cycle-count analysis presented here.
|
||||
|
|
@ -0,0 +1,46 @@
|
|||
% ── 7. Conclusion ─────────────────────────────────────────────────────────────
|
||||
\section{Conclusion}
|
||||
\label{sec:conclusion}
|
||||
|
||||
We presented the first statistically rigorous decomposition of SIMD speedup
|
||||
in \mlkem{} (Kyber), isolating the contributions of compiler optimization,
|
||||
auto-vectorization, and hand-written AVX2 assembly. Our main findings are:
|
||||
|
||||
\begin{enumerate}
|
||||
\item \textbf{Hand-written SIMD is necessary, not optional.} GCC's
|
||||
auto-vectorizer provides negligible benefit ($<10\%$) for NTT-based
|
||||
arithmetic, and for \op{INVNTT} actually produces slightly slower code
|
||||
than non-vectorized O3. The full \speedup{35}--\speedup{56} speedup
|
||||
on arithmetic operations comes entirely from hand-written assembly.
|
||||
|
||||
\item \textbf{The distribution of SIMD benefit across operations is
|
||||
highly non-uniform.} Arithmetic operations (NTT, INVNTT, basemul,
|
||||
frommsg) achieve \speedup{35}--\speedup{56}; SHAKE-based expansion
|
||||
(gen\_a) achieves only \speedup{3.8}--\speedup{4.7}; and noise
|
||||
sampling achieves \speedup{1.2}--\speedup{1.4}. The bottleneck shifts
|
||||
from compute to memory bandwidth for non-arithmetic operations.
|
||||
|
||||
\item \textbf{The statistical signal is overwhelming.} Cliff's $\delta =
|
||||
+1.000$ for nearly all operations means AVX2 is faster than \varref{}
|
||||
in every single observation pair across $n \ge 2{,}000$ measurements.
|
||||
These results are stable across three \mlkem{} parameter sets.
|
||||
|
||||
\item \textbf{Context affects even isolated micro-benchmarks.} The NTT
|
||||
speedup varies by 13\% across parameter sets despite identical
|
||||
polynomial dimensions, attributed to cache-state effects from
|
||||
surrounding polyvec operations.
|
||||
\end{enumerate}
|
||||
|
||||
\paragraph{Future work.}
|
||||
Planned extensions include: hardware performance counter profiles (IPC, cache
|
||||
miss rates) via PAPI to validate the mechanistic explanations in
|
||||
§\ref{sec:discussion}; energy measurement via Intel RAPL; extension to
|
||||
\mldsa{} (Dilithium) and \slhdsa{} (SPHINCS+) with the same harness; and
|
||||
cross-ISA comparison with ARM NEON/SVE (Graviton3) and RISC-V V. A compiler
|
||||
version sensitivity study (GCC 11--14, Clang 14--17) will characterize how
|
||||
stable the auto-vectorization gap is across compiler releases.
|
||||
|
||||
\paragraph{Artifact.}
|
||||
The benchmark harness, SLURM job templates, raw cycle-count data, analysis
|
||||
pipeline, and this paper are released at
|
||||
\url{https://github.com/lneuwirth/where-simd-helps} under an open license.
|
||||
|
|
@ -0,0 +1,104 @@
|
|||
% ── 5. Discussion ─────────────────────────────────────────────────────────────
|
||||
\section{Discussion}
|
||||
\label{sec:discussion}
|
||||
|
||||
\subsection{Why Arithmetic Operations Benefit Most}
|
||||
|
||||
The NTT butterfly loop processes 128 pairs of 16-bit coefficients per forward
|
||||
transform. In the scalar \varref{} path, each butterfly requires a modular
|
||||
multiplication (implemented as a Barrett reduction), an addition, and a
|
||||
subtraction---roughly 10--15 instructions per pair with data-dependent
|
||||
serialization through the multiply-add chain. The AVX2 path uses
|
||||
\texttt{vpmullw}/\texttt{vpmulhw} to compute 16 Montgomery multiplications
|
||||
per instruction, processing an entire butterfly layer with roughly
$16\times$ fewer instructions.
|
||||
|
||||
The observed INVNTT speedup of \speedup{56.3} at \mlkemk{512} \emph{exceeds}
|
||||
the theoretical $16\times$ register-width advantage. We attribute this to
|
||||
two compounding factors: (1) the unrolled hand-written assembly eliminates
|
||||
loop overhead and branch prediction pressure; (2) the inverse NTT has a
|
||||
slightly different access pattern than the forward NTT that benefits from
|
||||
out-of-order execution with wide issue ports on the Cascade Lake
|
||||
microarchitecture. \phasetwo{Confirm with IPC and port utilisation counters.}
|
||||
|
||||
\subsection{Why the Compiler Cannot Auto-Vectorise NTT}
|
||||
|
||||
A striking result is that \varref{} and \varrefnv{} perform nearly identically
|
||||
for all arithmetic operations ($\leq 10\%$ difference, with \varrefnv{}
|
||||
occasionally faster). This means GCC's tree-vectorizer produces no net benefit
|
||||
for the NTT inner loop.
|
||||
|
||||
The fundamental obstacle is \emph{modular reduction}: Barrett reduction and
|
||||
Montgomery reduction require a multiply-high operation (\texttt{vpmulhw}) that
|
||||
GCC cannot express through the scalar multiply-add chain it generates for the
|
||||
C reference code. Additionally, the NTT butterfly requires coefficient
|
||||
interleaving (odd/even index separation) that the auto-vectorizer does not
|
||||
recognize as a known shuffle pattern. The hand-written assembly encodes these
|
||||
patterns directly in \texttt{vpunpck*} instructions.
|
||||
|
||||
This finding has practical significance: developers porting \mlkem{} to new
|
||||
platforms cannot rely on the compiler to provide SIMD speedup for the NTT.
|
||||
Hand-written intrinsics or architecture-specific assembly are necessary.
|
||||
|
||||
\subsection{Why SHAKE Operations Benefit Less}
|
||||
|
||||
\op{gen\_a} expands a public seed into a $k \times k$ matrix of polynomials
|
||||
using SHAKE-128. Each Keccak-f[1600] permutation operates on a 200-byte state
|
||||
that does not fit in AVX2 registers (16 lanes $\times$ 16 bits = 32 bytes). The
|
||||
AVX2 Keccak implementation achieves \speedup{3.8}--\speedup{4.7} primarily by
|
||||
batching multiple independent absorb phases and using vectorized XOR across
|
||||
parallel state words---a different kind of SIMD parallelism than the arithmetic
|
||||
path. The bottleneck shifts to memory bandwidth as the permutation state is
|
||||
repeatedly loaded from and stored to L1 cache.
|
||||
|
||||
\subsection{Why Noise Sampling Barely Benefits}
|
||||
|
||||
CBD noise sampling reads adjacent bits from a byte stream and computes
|
||||
Hamming weights. The scalar path already uses bitwise operations with no
|
||||
data-dependent branches (constant-time design). The AVX2 path can batch the
|
||||
popcount computation but remains bottlenecked by the sequential bitstream
|
||||
access pattern. The small \speedup{1.2}--\speedup{1.4} speedup reflects
|
||||
this fundamental memory access bottleneck rather than compute limitation.
|
||||
|
||||
\subsection{NTT Cache-State Variation Across Parameter Sets}
|
||||
|
||||
The 13\% variation in NTT speedup across parameter sets
|
||||
(§\ref{sec:results:crossparams}) despite identical polynomial dimensions
|
||||
suggests that execution context matters even for nominally isolated
|
||||
micro-benchmarks. Higher-$k$ polyvec operations that precede each NTT call
|
||||
have larger memory footprints ($k$ more polynomials in the accumulation
|
||||
buffer), potentially evicting portions of the instruction cache or L1 data
|
||||
cache that the scalar NTT path relies on. The AVX2 path is less affected
|
||||
because it maintains more coefficient state in vector registers between
|
||||
operations. \phasetwo{Verify with L1/L2 miss counters split by scalar vs AVX2.}
|
||||
|
||||
\subsection{Implications for Deployment}
|
||||
|
||||
The end-to-end KEM speedups of \speedup{5.4}--\speedup{7.1} (Appendix,
|
||||
Figure~\ref{fig:kemlevel}) represent the practical deployment benefit.
|
||||
Deployments that cannot use hand-written SIMD (e.g., some constrained
|
||||
environments, or languages without inline assembly support) should expect
|
||||
performance within a factor of $5$--$7$ of the AVX2 reference.
|
||||
Auto-vectorization provides essentially no shortcut: the gap between
|
||||
compiler-optimized C and hand-written SIMD is the full $5$--$7\times$, not
|
||||
a fraction of it.
|
||||
|
||||
\subsection{Limitations}
|
||||
|
||||
\paragraph{No hardware counter data (Phase~1).} The mechanistic explanations
|
||||
in this section are derived analytically from instruction-set structure and
|
||||
publicly known microarchitecture details. Phase~2 will validate these with
|
||||
PAPI counter measurements. \phasetwo{PAPI counters: IPC, cache miss rates.}
|
||||
|
||||
\paragraph{Single microarchitecture.} All results are from Intel Cascade Lake
|
||||
(Xeon Platinum 8268). Speedup ratios may differ on other AVX2 hosts (e.g.,
|
||||
Intel Skylake, AMD Zen 3/4) due to differences in execution port configuration,
|
||||
vector throughput, and out-of-order window size.
|
||||
\phasethree{Repeat on AMD Zen, ARM Graviton3, RISC-V.}
|
||||
|
||||
\paragraph{Frequency scaling.} OSCAR nodes may operate in a power-capped mode
|
||||
that reduces Turbo Boost frequency under sustained SIMD load. RDTSC counts
|
||||
wall-clock ticks at the invariant TSC frequency, which may differ from the
|
||||
actual core frequency during SIMD execution.
|
||||
\phasetwo{Characterize frequency during benchmarks; consider RAPL-normalized
|
||||
cycle counts.}
|
||||
|
|
@ -0,0 +1,51 @@
|
|||
% ── 1. Introduction ───────────────────────────────────────────────────────────
|
||||
\section{Introduction}
|
||||
\label{sec:intro}
|
||||
|
||||
The 2024 NIST post-quantum cryptography standards~\cite{fips203,fips204,fips205}
|
||||
mark a turning point in deployed cryptography. \mlkem{} (Module-Lattice Key
|
||||
Encapsulation Mechanism, FIPS~203) is already being integrated into TLS~1.3 by
|
||||
major browser vendors~\cite{bettini2024} and is planned for inclusion in OpenSSH.
|
||||
At deployment scale, performance matters: a server handling thousands of TLS
|
||||
handshakes per second experiences a non-trivial computational overhead from
|
||||
replacing elliptic-curve key exchange with a lattice-based KEM.
|
||||
|
||||
Reference implementations of \mlkem{} ship with hand-optimized AVX2 assembly
|
||||
for the dominant operations~\cite{kyber-avx2}. Benchmarks routinely report
|
||||
that the AVX2 path is ``$5$--$7\times$ faster'' than the portable C reference.
|
||||
However, such top-level numbers conflate several distinct phenomena:
|
||||
compiler optimization, compiler auto-vectorization, and hand-written SIMD. They
|
||||
also say nothing about \emph{which} operations drive the speedup or \emph{why}
|
||||
the assembly is faster than what a compiler can produce automatically.
|
||||
|
||||
\subsection*{Contributions}
|
||||
|
||||
This paper makes the following contributions:
|
||||
|
||||
\begin{enumerate}
|
||||
\item \textbf{Three-way speedup decomposition.} We isolate compiler
|
||||
optimization, auto-vectorization, and hand-written SIMD as separate
|
||||
factors using four compilation variants (§\ref{sec:methodology}).
|
||||
|
||||
\item \textbf{Statistically rigorous benchmarking.} All comparisons are
|
||||
backed by Mann-Whitney U tests and Cliff's~$\delta$ effect-size
|
||||
analysis over $n \ge 2{,}000$ independent observations, with
|
||||
bootstrapped 95\% confidence intervals on speedup ratios
|
||||
(§\ref{sec:results}).
|
||||
|
||||
\item \textbf{Mechanistic analysis without hardware counters.} We explain
|
||||
the quantitative speedup pattern analytically from the structure of
|
||||
the NTT butterfly, Montgomery multiplication, and the SHAKE-128
|
||||
permutation (§\ref{sec:discussion}).
|
||||
|
||||
\item \textbf{Open reproducible artifact.} The full pipeline from raw
|
||||
SLURM outputs to publication figures is released publicly.
|
||||
\end{enumerate}
|
||||
|
||||
\subsection*{Scope and roadmap}
|
||||
|
||||
This report covers Phase~1 of a broader study: \mlkem{} on Intel x86-64 with
|
||||
AVX2. Planned extensions include hardware performance counter profiles (PAPI),
|
||||
energy measurement (Intel RAPL), extension to \mldsa{} (Dilithium), and
|
||||
cross-ISA comparison with ARM NEON/SVE and RISC-V V. Those results will be
|
||||
incorporated in subsequent revisions.
|
||||
|
|
@ -0,0 +1,105 @@
|
|||
% ── 3. Methodology ────────────────────────────────────────────────────────────
|
||||
\section{Methodology}
|
||||
\label{sec:methodology}
|
||||
|
||||
\subsection{Implementation Source}
|
||||
|
||||
We use the \mlkem{} reference implementation from the \texttt{pq-crystals/kyber}
|
||||
repository~\cite{kyber-avx2}, which provides both a portable C reference
|
||||
(\varref{} / \varrefnv{}) and hand-written AVX2 assembly (\varavx{}). The
|
||||
implementation targets the CRYSTALS-Kyber specification, functionally identical
|
||||
to FIPS~203.
|
||||
|
||||
\subsection{Compilation Variants}
|
||||
\label{sec:meth:variants}
|
||||
|
||||
We compile the same C source under four variant configurations using GCC 13.3.0:
|
||||
|
||||
\begin{description}
|
||||
\item[\varrefo{}] \texttt{-O0}: unoptimized. Every operation is loaded/stored
|
||||
through memory; no inlining, no register allocation. Establishes a
|
||||
reproducible performance floor.
|
||||
\item[\varrefnv{}] \texttt{-O3 -fno-tree-vectorize}: aggressive scalar
|
||||
optimization but with the tree-vectorizer disabled. Isolates the
|
||||
auto-vectorization contribution from general O3 optimizations.
|
||||
\item[\varref{}] \texttt{-O3}: full optimization with GCC auto-vectorization
|
||||
enabled. Represents realistic scalar-C performance.
|
||||
\item[\varavx{}] \texttt{-O3} with hand-written AVX2 assembly linked in:
|
||||
the production optimized path.
|
||||
\end{description}
|
||||
|
||||
All four variants are built with position-independent code and identical linker
|
||||
flags. The AVX2 assembly sources use the same \texttt{KYBER\_NAMESPACE} macro
|
||||
as the C sources to prevent symbol collisions.
|
||||
|
||||
\subsection{Benchmark Harness}
|
||||
|
||||
Each binary runs a \emph{spin loop}: $N = 1{,}000$ outer iterations (spins),
|
||||
each performing 20~repetitions of the target operation followed by a median
|
||||
and mean cycle count report via \texttt{RDTSC}. Using the median of 20
|
||||
repetitions per spin suppresses within-spin outliers; collecting 1{,}000 spins
|
||||
produces a distribution of 1{,}000 median observations per binary invocation.
|
||||
|
||||
Two independent job submissions per (algorithm, variant) pair yield
|
||||
$n \ge 2{,}000$ independent observations per group (3{,}000 for \varref{} and
|
||||
\varavx{}, which had a third clean run). All runs used \texttt{taskset} to pin
|
||||
to a single logical core, preventing OS scheduling interference.
|
||||
|
||||
\subsection{Hardware Platform}
|
||||
|
||||
All benchmarks were conducted on Brown University's OSCAR HPC cluster, node
|
||||
\texttt{node2334}, pinned via SLURM's \texttt{{-}{-}nodelist} directive to
|
||||
ensure all variants measured on identical hardware. The node specifications are:
|
||||
|
||||
\begin{center}
|
||||
\small
|
||||
\begin{tabular}{ll}
|
||||
\toprule
|
||||
CPU model & Intel Xeon Platinum 8268 (Cascade Lake) \\
|
||||
Clock speed & 2.90\,GHz base \\
|
||||
ISA extensions & SSE4.2, AVX, AVX2, AVX-512F \\
|
||||
L1D cache & 32\,KB (per core) \\
|
||||
L2 cache & 1\,MB (per core) \\
|
||||
L3 cache & 35.75\,MB (shared) \\
|
||||
OS & Linux (kernel 3.10) \\
|
||||
Compiler & GCC 13.3.0 \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{center}
|
||||
|
||||
\noindent\textbf{Reproducibility note:} The \texttt{perf\_event\_paranoid}
|
||||
setting on OSCAR nodes is 2, which prevents unprivileged access to hardware
|
||||
performance counters. Hardware counter data (IPC, cache miss rates) will be
|
||||
collected in Phase~2 after requesting elevated permissions from the cluster
|
||||
administrators. \phasetwo{Hardware counter collection via PAPI.}
|
||||
|
||||
\subsection{Statistical Methodology}
|
||||
\label{sec:meth:stats}
|
||||
|
||||
Cycle count distributions are right-skewed with occasional outliers from
|
||||
OS interrupts and cache-cold starts (Figure~\ref{fig:distributions}). We
|
||||
therefore use nonparametric statistics throughout:
|
||||
|
||||
\begin{itemize}
|
||||
\item \textbf{Speedup}: ratio of group medians, $\hat{s} =
|
||||
\text{median}(X_\text{baseline}) / \text{median}(X_\text{variant})$.
|
||||
\item \textbf{Confidence interval}: 95\% bootstrap CI on $\hat{s}$,
|
||||
computed by resampling both groups independently $B = 5{,}000$ times
|
||||
with replacement.
|
||||
\item \textbf{Mann-Whitney U test}: one-sided test for the hypothesis that
|
||||
the variant distribution is stochastically smaller than the baseline
|
||||
($H_1: P(X_\text{variant} < X_\text{baseline}) > 0.5$).
|
||||
\item \textbf{Cliff's $\delta$}: effect size defined as $\delta =
|
||||
[P(X_\text{variant} < X_\text{baseline}) -
|
||||
P(X_\text{variant} > X_\text{baseline})]$, derived from the
|
||||
Mann-Whitney U statistic. $\delta = +1$ indicates that
|
||||
\emph{every} variant observation is faster than \emph{every}
|
||||
baseline observation.
|
||||
\end{itemize}
|
||||
|
||||
\subsection{Energy Measurement}
|
||||
\label{sec:meth:energy}
|
||||
\phasetwo{Intel RAPL (pkg + DRAM domains), EDP computation, per-operation joules.}
|
||||
Energy measurements via Intel RAPL will be incorporated in Phase~2. The harness
|
||||
already includes conditional RAPL support (\texttt{-DWITH\_RAPL=ON}) pending
|
||||
appropriate system permissions.
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
% ── 6. Related Work ───────────────────────────────────────────────────────────
|
||||
\section{Related Work}
|
||||
\label{sec:related}
|
||||
|
||||
\paragraph{ML-KEM / Kyber implementations.}
|
||||
The AVX2 implementation studied here was developed by Schwabe and
|
||||
Seiler~\cite{kyber-avx2} and forms the optimized path in both the
|
||||
\texttt{pq-crystals/kyber} reference repository and
|
||||
PQClean~\cite{pqclean}. Bos et al.~\cite{kyber2018} describe the original
|
||||
Kyber submission; FIPS~203~\cite{fips203} is the standardized form.
|
||||
The ARM NEON and Cortex-M4 implementations are available in
|
||||
pqm4~\cite{pqm4}; cross-ISA comparison is planned for Phase~3.
|
||||
|
||||
\paragraph{PQC benchmarking.}
|
||||
eBACS/SUPERCOP provides a cross-platform benchmark suite~\cite{supercop} that
|
||||
reports median cycle counts for many cryptographic primitives, including Kyber.
|
||||
Our contribution complements this with a statistically rigorous decomposition
|
||||
using nonparametric effect-size analysis and bootstrapped CIs. Kannwischer et
|
||||
al.~\cite{pqm4} present systematic benchmarks on ARM Cortex-M4 (pqm4), which
|
||||
focuses on constrained-device performance rather than SIMD analysis.
|
||||
|
||||
\paragraph{SIMD in cryptography.}
|
||||
Gueron and Krasnov demonstrated AVX2 speedups for AES-GCM~\cite{gueron2014};
|
||||
similar techniques underpin the Kyber AVX2 implementation. Bernstein's
|
||||
vectorized polynomial arithmetic for Curve25519~\cite{bernstein2006} established
|
||||
the template of hand-written vector intrinsics for cryptographic field
|
||||
arithmetic.
|
||||
|
||||
\paragraph{NTT optimization.}
|
||||
Longa and Naehrig~\cite{ntt-survey} survey NTT algorithms for ideal
|
||||
lattice-based cryptography and analyze instruction counts for vectorized
|
||||
implementations. Our measurements provide the first empirical cycle-count
|
||||
decomposition isolating the compiler's contribution vs.\ hand-written SIMD for
|
||||
the ML-KEM NTT specifically.
|
||||
|
||||
\paragraph{Hardware counter profiling.}
|
||||
Bernstein and Schwabe~\cite{cachetime} discuss the relationship between cache
|
||||
behavior and cryptographic timing. PAPI~\cite{papi} provides a portable
|
||||
interface to hardware performance counters used in related profiling work.
|
||||
Phase~2 of this study will add PAPI counter collection to provide the
|
||||
mechanistic hardware-level explanation of the speedups observed here.
|
||||
|
|
@ -0,0 +1,181 @@
|
|||
% ── 4. Results ────────────────────────────────────────────────────────────────
|
||||
\section{Results}
|
||||
\label{sec:results}
|
||||
|
||||
\subsection{Cycle Count Distributions}
|
||||
\label{sec:results:distributions}
|
||||
|
||||
Figure~\ref{fig:distributions} shows the cycle count distributions for three
|
||||
representative operations in \mlkemk{512}, comparing \varref{} and \varavx{}.
|
||||
All distributions are right-skewed with a long tail from OS interrupts and
|
||||
cache-cold executions. The median (dashed lines) is robust to these outliers,
|
||||
justifying the nonparametric approach of §\ref{sec:meth:stats}.
|
||||
|
||||
The separation between \varref{} and \varavx{} is qualitatively different
|
||||
across operation types: for \op{INVNTT} the distributions do not overlap at
|
||||
all (disjoint spikes separated by two orders of magnitude on the log scale);
|
||||
for \op{gen\_a} there is partial overlap; for noise sampling the distributions
|
||||
are nearly coincident.
|
||||
|
||||
\begin{figure}[t]
|
||||
\centering
|
||||
\includegraphics[width=\columnwidth]{figures/distributions.pdf}
|
||||
\caption{Cycle count distributions for three representative \mlkemk{512}
|
||||
operations. Log $x$-axis. Dashed lines mark medians. Right-skew and
|
||||
outlier structure motivate nonparametric statistics.}
|
||||
\label{fig:distributions}
|
||||
\end{figure}
|
||||
|
||||
\subsection{Speedup Decomposition}
|
||||
\label{sec:results:decomp}
|
||||
|
||||
Figure~\ref{fig:decomp} shows the cumulative speedup at each optimization stage
|
||||
for all three \mlkem{} parameter sets. Each group of bars represents one
|
||||
operation; the three bars within a group show the total speedup achieved after
|
||||
applying (i)~O3 without auto-vec (\varrefnv{}), (ii)~O3 with auto-vec
|
||||
(\varref{}), and (iii)~hand-written AVX2 (\varavx{})---all normalized to the
|
||||
unoptimized \varrefo{} baseline. The log scale makes the three orders of
|
||||
magnitude of variation legible.
|
||||
|
||||
Several structural features are immediately apparent:
|
||||
\begin{itemize}
|
||||
\item The \varrefnv{} and \varref{} bars are nearly indistinguishable for
|
||||
arithmetic operations (NTT, INVNTT, basemul, frommsg), confirming that
|
||||
GCC's auto-vectorizer contributes negligibly to these operations.
|
||||
\item The \varavx{} bars are 1--2 orders of magnitude taller than the
|
||||
\varref{} bars for arithmetic operations, indicating that hand-written
|
||||
SIMD dominates the speedup.
|
||||
\item For SHAKE-heavy operations (gen\_a, noise), all three bars are much
|
||||
closer together, reflecting the memory-bandwidth bottleneck that limits
|
||||
SIMD benefit.
|
||||
\end{itemize}
|
||||
|
||||
\begin{figure*}[t]
|
||||
\centering
|
||||
\input{figures/fig_decomp}
|
||||
\caption{Cumulative speedup at each optimization stage, normalized to
|
||||
\varrefo{} (1×). Three bars per operation:
|
||||
\textcolor{colRefnv}{$\blacksquare$}~O3 no auto-vec,
|
||||
\textcolor{colRef}{$\blacksquare$}~O3 + auto-vec,
|
||||
\textcolor{colAvx}{$\blacksquare$}~O3 + hand SIMD (AVX2).
|
||||
Log $y$-axis; 95\% bootstrap CI shown on \varavx{} bars.
|
||||
Sorted by \varavx{} speedup.}
|
||||
\label{fig:decomp}
|
||||
\end{figure*}
|
||||
|
||||
\subsection{Hand-Written SIMD Speedup}
|
||||
\label{sec:results:simd}
|
||||
|
||||
Figure~\ref{fig:handsimd} isolates the hand-written SIMD speedup (\varref{}
|
||||
$\to$ \varavx{}) across all three \mlkem{} parameter sets. Table~\ref{tab:simd}
|
||||
summarizes the numerical values.
|
||||
|
||||
Key observations:
|
||||
\begin{itemize}
|
||||
\item \textbf{Arithmetic operations} achieve the largest speedups:
|
||||
\speedup{56.3} for \op{INVNTT} at \mlkemk{512}, \speedup{52.0} for
|
||||
\op{basemul}, and \speedup{45.6} for \op{frommsg}. The 95\% bootstrap
|
||||
CIs on these ratios are extremely tight (often $[\hat{s}, \hat{s}]$ to
|
||||
two decimal places), reflecting near-perfect measurement stability.
|
||||
\item \textbf{gen\_a} achieves \speedup{3.8}--\speedup{4.8}: substantially
|
||||
smaller than arithmetic operations because SHAKE-128 generation is
|
||||
memory-bandwidth limited.
|
||||
\item \textbf{Noise sampling} achieves only \speedup{1.2}--\speedup{1.4},
|
||||
the smallest SIMD benefit. The centered binomial distribution (CBD)
|
||||
sampler is bit-manipulation-heavy with sequential bitstream reads that
|
||||
do not parallelize well.
|
||||
\item Speedups are broadly consistent across parameter sets for per-polynomial
|
||||
operations, as expected (§\ref{sec:results:crossparams}).
|
||||
\end{itemize}
|
||||
|
||||
\begin{figure*}[t]
|
||||
\centering
|
||||
\input{figures/fig_hand_simd}
|
||||
\caption{Hand-written SIMD speedup (\varref{} $\to$ \varavx{}) per operation,
|
||||
across all three \mlkem{} parameter sets. Log $y$-axis.
|
||||
95\% bootstrap CI error bars (often sub-pixel).
|
||||
Sorted by \mlkemk{512} speedup.}
|
||||
\label{fig:handsimd}
|
||||
\end{figure*}
|
||||
|
||||
\begin{table}[t]
|
||||
\caption{Hand-written SIMD speedup (\varref{} $\to$ \varavx{}), median ratio
|
||||
with 95\% bootstrap CI. All Cliff's $\delta = +1.000$, $p < 10^{-300}$.}
|
||||
\label{tab:simd}
|
||||
\small
|
||||
\begin{tabular}{lccc}
|
||||
\toprule
|
||||
Operation & \mlkemk{512} & \mlkemk{768} & \mlkemk{1024} \\
|
||||
\midrule
|
||||
\op{INVNTT} & $56.3\times$ & $52.2\times$ & $50.5\times$ \\
|
||||
\op{basemul} & $52.0\times$ & $47.6\times$ & $41.6\times$ \\
|
||||
\op{frommsg} & $45.6\times$ & $49.2\times$ & $55.4\times$ \\
|
||||
\op{NTT} & $35.5\times$ & $39.4\times$ & $34.6\times$ \\
|
||||
\op{iDec} & $35.1\times$ & $35.0\times$ & $31.1\times$ \\
|
||||
\op{iEnc} & $10.0\times$ & $9.4\times$ & $9.4\times$ \\
|
||||
\op{iKeypair}& $8.3\times$ & $7.6\times$ & $8.1\times$ \\
|
||||
\op{gen\_a} & $4.7\times$ & $3.8\times$ & $4.8\times$ \\
|
||||
\op{noise} & $1.4\times$ & $1.4\times$ & $1.2\times$ \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
\subsection{Statistical Significance}
|
||||
\label{sec:results:stats}
|
||||
|
||||
All \varref{} vs.\ \varavx{} comparisons pass the Mann-Whitney U test at
|
||||
$p < 10^{-300}$. Cliff's $\delta = +1.000$ for all operations except
|
||||
\op{NTT} at \mlkemk{512} and \mlkemk{1024} ($\delta = +0.999$), meaning AVX2
|
||||
achieves a strictly smaller cycle count than \varref{} in effectively every
|
||||
observation pair.
|
||||
|
||||
Figure~\ref{fig:cliffs} shows the heatmap of Cliff's $\delta$ values across
|
||||
all operations and parameter sets.
|
||||
|
||||
\begin{figure}[t]
|
||||
\centering
|
||||
\includegraphics[width=\columnwidth]{figures/cliffs_delta_heatmap.pdf}
|
||||
\caption{Cliff's $\delta$ (\varref{} vs.\ \varavx{}) for all operations and
|
||||
parameter sets. $\delta = +1$: AVX2 is faster in every observation
|
||||
pair. Nearly all cells are at $+1.000$.}
|
||||
\label{fig:cliffs}
|
||||
\end{figure}
|
||||
|
||||
\subsection{Cross-Parameter Consistency}
|
||||
\label{sec:results:crossparams}
|
||||
|
||||
Figure~\ref{fig:crossparams} shows the \varavx{} speedup for the four
|
||||
per-polynomial operations across \mlkemk{512}, \mlkemk{768}, and
|
||||
\mlkemk{1024}. Since all three instantiations operate on 256-coefficient
|
||||
polynomials, speedups for \op{frommsg} and \op{INVNTT} should be
|
||||
parameter-independent. This holds approximately: \op{frommsg} varies by only
$\pm 10\%$, \op{INVNTT} by $\pm 6\%$.
|
||||
|
||||
\op{NTT} shows a more pronounced variation ($35.5\times$ at \mlkemk{512},
|
||||
$39.4\times$ at \mlkemk{768}, $34.6\times$ at \mlkemk{1024}) that is
|
||||
statistically real (non-overlapping 95\% CIs). We attribute this to
|
||||
\emph{cache state effects}: the surrounding polyvec loops that precede each
|
||||
NTT call have a footprint that varies with $k$, leaving different cache
|
||||
residency patterns that affect NTT latency in the scalar \varref{} path.
|
||||
The AVX2 path is less sensitive because its smaller register footprint keeps
|
||||
more state in vector registers.
|
||||
|
||||
\begin{figure}[t]
|
||||
\centering
|
||||
\input{figures/fig_cross_param}
|
||||
\caption{Per-polynomial operation speedup (\varref{} $\to$ \varavx{}) across
|
||||
security parameters. Polynomial dimension is 256 for all; variation
|
||||
reflects cache-state differences in the calling context.}
|
||||
\label{fig:crossparams}
|
||||
\end{figure}
|
||||
|
||||
\subsection{Hardware Counter Breakdown}
|
||||
\label{sec:results:papi}
|
||||
\phasetwo{IPC, L1/L2/L3 cache miss rates, branch mispredictions via PAPI.
|
||||
This section will contain bar charts of per-counter values comparing ref and
|
||||
avx2 for each operation, explaining the mechanistic origins of the speedup.}
|
||||
|
||||
\subsection{Energy Efficiency}
|
||||
\label{sec:results:energy}
|
||||
\phasetwo{Intel RAPL pkg + DRAM energy readings per operation.
|
||||
EDP (energy-delay product) comparison. Energy per KEM operation.}
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
% ── Supplementary: KEM-level end-to-end speedup ───────────────────────────────
|
||||
\section{End-to-End KEM Speedup}
|
||||
\label{sec:supp:kem}
|
||||
|
||||
Figure~\ref{fig:kemlevel} shows the hand-written SIMD speedup for the
|
||||
top-level KEM operations: key generation (\op{kyber\_keypair}), encapsulation
|
||||
(\op{kyber\_encaps}), and decapsulation (\op{kyber\_decaps}). These composite
|
||||
operations aggregate the speedups of their constituent primitives, weighted by
|
||||
relative cycle counts.
|
||||
|
||||
Decapsulation achieves the highest speedup (\speedup{6.9}--\speedup{7.1})
|
||||
because it involves the largest share of arithmetic operations (two additional
|
||||
NTT and INVNTT calls for re-encryption verification). Key generation achieves
|
||||
the lowest (\speedup{5.3}--\speedup{5.9}) because it involves one fewer
|
||||
polynomial multiplication step relative to encapsulation.
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\input{figures/fig_kem_level}
|
||||
\caption{End-to-end KEM speedup (\varref{} $\to$ \varavx{}) for
|
||||
\op{kyber\_keypair}, \op{kyber\_encaps}, and \op{kyber\_decaps}.
|
||||
Intel Xeon Platinum 8268; 95\% bootstrap CI.}
|
||||
\label{fig:kemlevel}
|
||||
\end{figure}
|
||||
|
||||
\section{Full Operation Set}
|
||||
\label{sec:supp:fullops}
|
||||
|
||||
\todo[inline]{Full operation speedup table for all 20 benchmarked operations,
|
||||
including \op{poly\_compress}, \op{poly\_decompress}, \op{polyvec\_compress},
|
||||
\op{poly\_tomsg}, and the \texttt{*\_derand} KEM variants.}
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
#!/bin/bash
# Build all benchmark binaries on the HPC login node.
#
# Usage: bash slurm/build.sh [--papi] [--rapl]
#
# Run this once after rsyncing, before submitting jobs.
# Binaries are written to harness/build-hpc/.

set -euo pipefail

REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
BUILD_DIR="${REPO_ROOT}/harness/build-hpc"
WITH_PAPI=OFF
WITH_RAPL=OFF

# Optional feature flags; anything unrecognized is a hard error.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --papi) WITH_PAPI=ON ;;
        --rapl) WITH_RAPL=ON ;;
        *) echo "unknown flag: $1" >&2; exit 1 ;;
    esac
    shift
done

# Record the build configuration up front so job logs are self-describing.
echo "=== pqc-bench build ==="
echo "REPO_ROOT : $REPO_ROOT"
echo "BUILD_DIR : $BUILD_DIR"
echo "WITH_PAPI : $WITH_PAPI"
echo "WITH_RAPL : $WITH_RAPL"
echo "CC        : ${CC:-default}"
echo "DATE      : $(date -Iseconds)"

# Ensure submodule is populated.
# kem.c is used as a sentinel: if it is missing, the kyber checkout is empty.
if ! [[ -f "${REPO_ROOT}/algorithms/kyber/ref/kem.c" ]]; then
    echo "Populating git submodules..."
    git -C "$REPO_ROOT" submodule update --init --recursive
fi

# Configure and build out-of-tree; PAPI/RAPL are compile-time options of the harness.
cmake -B "$BUILD_DIR" \
      -S "${REPO_ROOT}/harness" \
      -DCMAKE_BUILD_TYPE=Release \
      -DWITH_PAPI="${WITH_PAPI}" \
      -DWITH_RAPL="${WITH_RAPL}"

cmake --build "$BUILD_DIR" --parallel

echo ""
echo "Built binaries:"
# The glob stays unexpanded when nothing was built, making ls fail -> fallback message.
ls -lh "${BUILD_DIR}"/bench_mlkem* 2>/dev/null || echo "(none found)"
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
#!/bin/bash
# Instantiate and submit SLURM benchmark jobs.
#
# Usage: bash slurm/submit.sh [--papi] [--nspins N] [--params LIST] [--variants LIST] [--node NODE]
#
# Examples:
#   bash slurm/submit.sh
#   bash slurm/submit.sh --papi --nspins 500
#   bash slurm/submit.sh --variants "ref avx2" --params "512 1024"
#   bash slurm/submit.sh --node node2334   # pin all jobs to a specific node

set -euo pipefail

REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
TEMPLATE="${REPO_ROOT}/slurm/templates/bench_mlkem.sh.tmpl"

# ── Defaults ─────────────────────────────────────────────────────────────────
NSPINS=1000
WITH_PAPI=OFF
PARAMS="512 768 1024"
VARIANTS="ref avx2 refnv refo0"
BENCH_NODE=""

while [[ $# -gt 0 ]]; do
    case "$1" in
        --papi) WITH_PAPI=ON ;;
        # Value-taking flags: check a value actually follows, otherwise
        # `set -u` would abort with an opaque unbound-variable error.
        --nspins)
            [[ $# -ge 2 ]] || { echo "missing value for flag: $1" >&2; exit 1; }
            shift; NSPINS="$1" ;;
        --params)
            [[ $# -ge 2 ]] || { echo "missing value for flag: $1" >&2; exit 1; }
            shift; PARAMS="$1" ;;
        --variants)
            [[ $# -ge 2 ]] || { echo "missing value for flag: $1" >&2; exit 1; }
            shift; VARIANTS="$1" ;;
        --node)
            [[ $# -ge 2 ]] || { echo "missing value for flag: $1" >&2; exit 1; }
            shift; BENCH_NODE="$1" ;;
        *) echo "unknown flag: $1" >&2; exit 1 ;;
    esac
    shift
done

# Build directory created by build.sh.
BUILD_DIR="${REPO_ROOT}/harness/build-hpc"

if [[ ! -d "$BUILD_DIR" ]]; then
    echo "ERROR: $BUILD_DIR not found — run slurm/build.sh first" >&2
    exit 1
fi

echo "=== pqc-bench submit ==="
echo "NSPINS   : $NSPINS"
echo "WITH_PAPI: $WITH_PAPI"
echo "PARAMS   : $PARAMS"
echo "VARIANTS : $VARIANTS"
echo "NODE     : ${BENCH_NODE:-any}"
echo ""

JOBS_SUBMITTED=0

for PARAM in $PARAMS; do
    for VARIANT in $VARIANTS; do
        BINARY="${BUILD_DIR}/bench_mlkem${PARAM}_${VARIANT}"
        if [[ ! -x "$BINARY" ]]; then
            echo "SKIP bench_mlkem${PARAM}_${VARIANT} — binary not found"
            continue
        fi

        # Output goes into data/raw/kyber/mlkem{PARAM}/{VARIANT}/ so the aggregation
        # tool infers algorithm and variant from the directory structure.
        OUTPUT_DIR="${REPO_ROOT}/data/raw/kyber/mlkem${PARAM}/${VARIANT}"
        mkdir -p "$OUTPUT_DIR"

        # Instantiate template. Only the listed variables are substituted so
        # any other $-syntax in the template survives untouched.
        # NOTE(review): the temp script is intentionally left behind after
        # submission — sbatch copies it, but keeping it aids debugging.
        JOB_SCRIPT="$(mktemp /tmp/bench_mlkem${PARAM}_${VARIANT}.XXXXXX.sh)"
        export PARAM VARIANT NSPINS BUILD_DIR OUTPUT_DIR WITH_PAPI BENCH_NODE
        envsubst '${PARAM} ${VARIANT} ${NSPINS} ${BUILD_DIR} ${OUTPUT_DIR} ${WITH_PAPI} ${BENCH_NODE}' \
            < "$TEMPLATE" > "$JOB_SCRIPT"
        chmod +x "$JOB_SCRIPT"

        # Build sbatch arguments as an array (quoting-safe; avoids relying on
        # word-splitting of an unquoted string — ShellCheck SC2086).
        SBATCH_ARGS=(--parsable)
        if [[ -n "$BENCH_NODE" ]]; then
            SBATCH_ARGS+=(--nodelist="$BENCH_NODE")
        fi
        JOB_ID=$(sbatch "${SBATCH_ARGS[@]}" "$JOB_SCRIPT")
        echo "SUBMIT bench_mlkem${PARAM}_${VARIANT} job=${JOB_ID} out=${OUTPUT_DIR}/${JOB_ID}.out"
        JOBS_SUBMITTED=$((JOBS_SUBMITTED + 1))
    done
done

echo ""
echo "Submitted $JOBS_SUBMITTED jobs."
|
||||
|
|
@ -1,38 +1,48 @@
|
|||
#!/bin/bash
|
||||
# Template SLURM job for ML-KEM benchmarking.
|
||||
# Variables filled in by slurm/submit.sh:
|
||||
# PARAM — 512 | 768 | 1024
|
||||
# VARIANT — ref | refnv | avx2 | ...
|
||||
# NTESTS — iterations per operation (default 10000)
|
||||
# BINARY — path to compiled benchmark binary
|
||||
# SLURM job template for ML-KEM benchmarking.
|
||||
# Instantiated by slurm/submit.sh — do not submit directly.
|
||||
#
|
||||
# Template variables (filled by envsubst in submit.sh):
|
||||
# PARAM — 512 | 768 | 1024
|
||||
# VARIANT — ref | avx2 | refnv | refo0
|
||||
# NSPINS — outer loop iterations (default 1000)
|
||||
# BUILD_DIR — path to directory containing the benchmark binaries
|
||||
# OUTPUT_DIR — directory where this job's .out file is written
|
||||
|
||||
#SBATCH -J bench_mlkem${PARAM}_${VARIANT}
|
||||
#SBATCH -p batch
|
||||
#SBATCH -n 1
|
||||
#SBATCH --mem=2G
|
||||
#SBATCH -t 02:00:00
|
||||
#SBATCH --constraint=intel
|
||||
#SBATCH -o %j_mlkem${PARAM}_${VARIANT}.out
|
||||
#SBATCH -c 1
|
||||
#SBATCH --mem=256M
|
||||
#SBATCH -t 00:45:00
|
||||
#SBATCH -o ${OUTPUT_DIR}/%j.out
|
||||
|
||||
# Pin to a single core, disable frequency scaling for deterministic measurements.
|
||||
# Requires appropriate OSCAR allocation; skip if unavailable.
|
||||
export GOMP_CPU_AFFINITY="0"
|
||||
# ── Environment ──────────────────────────────────────────────────────────────
|
||||
# Pin to a single logical core for deterministic measurements.
|
||||
taskset -cp 0 $$ 2>/dev/null || true
|
||||
|
||||
NTESTS=${NTESTS:-10000}
|
||||
BINARY=${BINARY:-./bench_mlkem${PARAM}_${VARIANT}}
|
||||
# Disable CPU frequency scaling if we have permission; ignore otherwise.
|
||||
cpupower frequency-set -g performance 2>/dev/null || true
|
||||
|
||||
# ── Metadata (parsed by analysis/pkg/parse) ──────────────────────────────────
|
||||
# These ## lines are picked up by the parser alongside the OSCAR prolog lines.
|
||||
echo "## BENCH_VARIANT : ${VARIANT}"
|
||||
echo "## BENCH_PARAM : ${PARAM}"
|
||||
echo "## BENCH_NSPINS : ${NSPINS}"
|
||||
echo "## BENCH_NODE_REQ : ${BENCH_NODE}"
|
||||
echo "## BENCH_BINARY : ${BUILD_DIR}/bench_mlkem${PARAM}_${VARIANT}"
|
||||
echo "## BENCH_DATE : $(date -Iseconds)"
|
||||
echo "## CPU_MODEL : $(grep 'model name' /proc/cpuinfo | head -1 | cut -d: -f2 | xargs)"
|
||||
echo "## PERF_PARANOID : $(cat /proc/sys/kernel/perf_event_paranoid 2>/dev/null || echo unknown)"
|
||||
echo "## PAPI_BUILD : ${WITH_PAPI:-OFF}"
|
||||
|
||||
BINARY="${BUILD_DIR}/bench_mlkem${PARAM}_${VARIANT}"
|
||||
NSPINS="${NSPINS:-1000}"
|
||||
|
||||
if [[ ! -x "$BINARY" ]]; then
|
||||
echo "ERROR: binary not found or not executable: $BINARY" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "=== bench_mlkem${PARAM}_${VARIANT} ==="
|
||||
echo "SLURM_JOB_ID: $SLURM_JOB_ID"
|
||||
echo "SLURM_NODELIST: $SLURM_NODELIST"
|
||||
echo "NTESTS: $NTESTS"
|
||||
echo "DATE: $(date -Iseconds)"
|
||||
echo "UNAME: $(uname -a)"
|
||||
echo "CPU: $(grep 'model name' /proc/cpuinfo | head -1 | cut -d: -f2 | xargs)"
|
||||
echo "---"
|
||||
|
||||
"$BINARY" "$NTESTS"
|
||||
# ── Run ───────────────────────────────────────────────────────────────────────
|
||||
"$BINARY" "$NSPINS"
|
||||
|
|
|
|||
Loading…
Reference in New Issue