initial move

2026-03-29 08:05:12 -04:00 · 2026-03-29 08:05:12 -04:00 · 7750ae3d8c
parent 719e611e39
commit 7750ae3d8c
38 changed files with 369 additions and 63049 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,40 @@
+# Build outputs
+harness/build/
+results/
+
+# Compiled binaries (analysis tools, when written)
+analysis/target/
+analysis/build/
+*.o
+*.a
+*.so
+
+# Algorithm submodule build artifacts
+algorithms/*/build/
+algorithms/*/Makefile
+algorithms/**/*.o
+
+# Generated data (re-runnable)
+data/objdump/
+
+# LaTeX build artifacts
+paper/*.aux
+paper/*.log
+paper/*.out
+paper/*.toc
+paper/*.bbl
+paper/*.blg
+paper/*.fls
+paper/*.fdb_latexmk
+paper/*.synctex.gz
+paper/*.pdf
+
+# Personal reference docs
+ROADMAP.md
+IDEATION.md
+
+# OS / editor
+.DS_Store
+.idea/
+*.swp
+*~
--- a/README.md
+++ b/README.md
@ -1,2 +1,15 @@
-# cs1952y-final
-Profiling Kyber: what are the implications of SIMD-optimized cryptography on performance and energy?
+# pqc-simd-bench
+
+Empirical study of SIMD optimization in post-quantum cryptography — performance
+and energy analysis across algorithm variants and security parameter sets.
+
+```
+algorithms/     PQC algorithm submodules (ML-KEM, ML-DSA, ...)
+harness/        Benchmark harness (C, CMake)
+slurm/          SLURM job templates for OSCAR
+data/raw/       Raw benchmark output, organized by algorithm and parameter set
+results/        Processed outputs and figures (generated, gitignored)
+paper/          LaTeX write-up
+```
+
+**Author:** Levi Neuwirth — [ln@levineuwirth.org](mailto:ln@levineuwirth.org)
--- a/algorithms/.gitkeep
+++ b/algorithms/.gitkeep
--- a/benchmark/objdump/ref1024.txt
+++ b/benchmark/objdump/ref1024.txt
--- a/benchmark/objdump/ref512.txt
+++ b/benchmark/objdump/ref512.txt
--- a/benchmark/objdump/ref768.txt
+++ b/benchmark/objdump/ref768.txt
--- a/benchmark/objdump/refnv0768.txt
+++ b/benchmark/objdump/refnv0768.txt
--- a/benchmark/objdump/refnv1024.txt
+++ b/benchmark/objdump/refnv1024.txt
--- a/benchmark/objdump/refnv512.txt
+++ b/benchmark/objdump/refnv512.txt
--- a/benchmark/objdump/refnv768.txt
+++ b/benchmark/objdump/refnv768.txt
--- a/benchmark/objdump/refo01024.txt
+++ b/benchmark/objdump/refo01024.txt
--- a/benchmark/objdump/refo0512.txt
+++ b/benchmark/objdump/refo0512.txt
--- a/benchmark/objdump/refo0768.txt
+++ b/benchmark/objdump/refo0768.txt
--- a/benchmark/test_1024.sh
+++ b/benchmark/test_1024.sh
@ -1,18 +0,0 @@
-#!/bin/sh
-# TODO: change me!
-# -p: which partition do you want to run your workload on? <batch, gpu, bigmem>
-# -n: how many CPU cores do you want to run your job?
-# --mem: how much memory do you want?
-# -t: how long do you want to run the job before it timesout <hh:mm:ss>
-# --constraint=intel: required for power monitoring
-
-#SBATCH -p batch
-#SBATCH -n 1
-#SBATCH --mem=1g
-#SBATCH -t 60:00
-#SBATCH --constraint=intel
-for i in {1..1000}
-	do
-		echo "Loop spin:" $i
-		./test_speed1024
-	done
--- a/benchmark/test_512.sh
+++ b/benchmark/test_512.sh
@ -1,18 +0,0 @@
-#!/bin/sh
-# TODO: change me!
-# -p: which partition do you want to run your workload on? <batch, gpu, bigmem>
-# -n: how many CPU cores do you want to run your job?
-# --mem: how much memory do you want?
-# -t: how long do you want to run the job before it timesout <hh:mm:ss>
-# --constraint=intel: required for power monitoring
-
-#SBATCH -p batch
-#SBATCH -n 1
-#SBATCH --mem=1g
-#SBATCH -t 60:00
-#SBATCH --constraint=intel
-for i in {1..1000}
-	do
-		echo "Loop spin:" $i
-		./test_speed512
-	done
--- a/benchmark/test_768.sh
+++ b/benchmark/test_768.sh
@ -1,18 +0,0 @@
-#!/bin/sh
-# TODO: change me!
-# -p: which partition do you want to run your workload on? <batch, gpu, bigmem>
-# -n: how many CPU cores do you want to run your job?
-# --mem: how much memory do you want?
-# -t: how long do you want to run the job before it timesout <hh:mm:ss>
-# --constraint=intel: required for power monitoring
-
-#SBATCH -p batch
-#SBATCH -n 1
-#SBATCH --mem=1g
-#SBATCH -t 60:00
-#SBATCH --constraint=intel
-for i in {1..1000}
-	do
-		echo "Loop spin:" $i
-		./test_speed768
-	done
--- a/benchmark/slurm/slurm-11233231.out
+++ b/benchmark/slurm/slurm-11233231.out
--- a/benchmark/slurm/slurm-11233308.out
+++ b/benchmark/slurm/slurm-11233308.out
--- a/benchmark/slurm/slurm-11233322.out
+++ b/benchmark/slurm/slurm-11233322.out
--- a/benchmark/slurm/slurm-11284569.out
+++ b/benchmark/slurm/slurm-11284569.out
--- a/benchmark/slurm/slurm-11233228.out
+++ b/benchmark/slurm/slurm-11233228.out
--- a/benchmark/slurm/slurm-11233306.out
+++ b/benchmark/slurm/slurm-11233306.out
--- a/benchmark/slurm/slurm-11233320.out
+++ b/benchmark/slurm/slurm-11233320.out
--- a/benchmark/slurm/slurm-11284566.out
+++ b/benchmark/slurm/slurm-11284566.out
--- a/benchmark/slurm/slurm-11233230.out
+++ b/benchmark/slurm/slurm-11233230.out
--- a/benchmark/slurm/slurm-11233307.out
+++ b/benchmark/slurm/slurm-11233307.out
--- a/benchmark/slurm/slurm-11233321.out
+++ b/benchmark/slurm/slurm-11233321.out
--- a/benchmark/slurm/slurm-11284568.out
+++ b/benchmark/slurm/slurm-11284568.out
--- a/harness/CMakeLists.txt
+++ b/harness/CMakeLists.txt
@ -0,0 +1,57 @@
+cmake_minimum_required(VERSION 3.20)
+project(pqc-bench LANGUAGES C)
+
+set(CMAKE_C_STANDARD 11)
+set(CMAKE_C_STANDARD_REQUIRED ON)  # fail at configure time rather than silently fall back to an older -std
+
+# ── Compiler flags ──────────────────────────────────────────────────────────
+# Default to a Release build; override with: cmake -DCMAKE_BUILD_TYPE=Debug ..
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+# Append instead of overwrite: keeps the toolchain's Release defaults (GCC/Clang: -O3 -DNDEBUG) and any user-supplied CMAKE_C_FLAGS_RELEASE.
+string(APPEND CMAKE_C_FLAGS_RELEASE " -march=native")
+
+# ── Algorithm root (submodule) ───────────────────────────────────────────────
+# Each target below compiles a variant of test_speed.c against a specific
+# algorithm build.  Add algorithm libraries as submodule CMake subdirectories
+# or via add_library() here as the project grows.
+#
+# Example layout once kyber submodule is added:
+#   algorithms/kyber/ref/   → static lib kyber512_ref, kyber768_ref, kyber1024_ref
+#   algorithms/kyber/avx2/  → static lib kyber512_avx2, ...
+
+# ── Harness source ───────────────────────────────────────────────────────────
+set(HARNESS_SRC src/test_speed.c)
+
+# ── Build variants ───────────────────────────────────────────────────────────
+# Uncomment and adjust as algorithm libraries become available.
+# NOTE(review): upstream Kyber's params.h appears to take KYBER_K as the module rank 2/3/4 (= PARAM / 256), not 512/768/1024 — confirm against the submodule.
+# foreach(PARAM 512 768 1024)
+#   math(EXPR KYBER_K "${PARAM} / 256")
+#   foreach(VARIANT ref refnv)
+#     set(TARGET "bench_mlkem${PARAM}_${VARIANT}")
+#     add_executable(${TARGET} ${HARNESS_SRC})
+#     target_include_directories(${TARGET} PRIVATE ${PROJECT_SOURCE_DIR}/../algorithms/kyber/${VARIANT})
+#     target_link_libraries(${TARGET} PRIVATE kyber${PARAM}_${VARIANT})
+#     target_compile_definitions(${TARGET} PRIVATE KYBER_K=${KYBER_K})
+#   endforeach()
+# endforeach()
+
+# ── PAPI (hardware performance counters) ─────────────────────────────────────
+# Optional; enable with -DWITH_PAPI=ON.  Consumers: target_link_libraries(<target> PRIVATE PAPI::papi)
+option(WITH_PAPI "Link against PAPI for hardware counter collection" OFF)
+if(WITH_PAPI)
+  find_library(PAPI_LIB papi REQUIRED)
+  find_path(PAPI_INCLUDE papi.h REQUIRED)
+  # Imported target bundles the library location with its include directory, so consumers need a single link line.
+  add_library(PAPI::papi UNKNOWN IMPORTED)
+  set_target_properties(PAPI::papi PROPERTIES IMPORTED_LOCATION "${PAPI_LIB}" INTERFACE_INCLUDE_DIRECTORIES "${PAPI_INCLUDE}")
+endif()
+
+# ── RAPL energy measurement ──────────────────────────────────────────────────
+# Optional; enable with -DWITH_RAPL=ON (requires root or CAP_SYS_RAWIO)
+option(WITH_RAPL "Enable Intel RAPL energy measurement" OFF)
+if(WITH_RAPL)
+  # Placeholder: this branch contains no commands yet, so the option currently has no effect. Intended wiring: target_compile_definitions(<target> PRIVATE WITH_RAPL)
+endif()
--- a/harness/src/test_speed.c
+++ b/harness/src/test_speed.c
--- a/paper/figures/.gitkeep
+++ b/paper/figures/.gitkeep
--- a/paper/main.tex
+++ b/paper/main.tex
@ -0,0 +1,120 @@
+\documentclass[sigconf, nonacm]{acmart}
+
+\usepackage{booktabs}
+\usepackage{microtype}
+\usepackage{pgfplots}
+\pgfplotsset{compat=1.18}
+
+% ── Metadata (fill in when ready) ────────────────────────────────────────────
+\title{SIMD Optimization in Post-Quantum Cryptography:\\
+       A Micro-Architecture and Energy Analysis}
+
+\author{Levi Neuwirth}
+\affiliation{%
+  \institution{Brown University}
+  \city{Providence}
+  \state{Rhode Island}
+  \country{USA}
+}
+\email{ln@levineuwirth.org}
+
+\begin{abstract}
+TODO
+\end{abstract}
+
+\keywords{post-quantum cryptography, ML-KEM, Kyber, SIMD, AVX2, performance
+          analysis, energy efficiency, micro-architecture}
+
+% ─────────────────────────────────────────────────────────────────────────────
+\begin{document}
+\maketitle
+
+% ── 1. Introduction ──────────────────────────────────────────────────────────
+\section{Introduction}
+\label{sec:intro}
+
+TODO
+
+% ── 2. Background ────────────────────────────────────────────────────────────
+\section{Background}
+\label{sec:background}
+
+\subsection{ML-KEM / Kyber}
+TODO: Module-LWE, ring structure, NTT.
+
+\subsection{SIMD on x86-64}
+TODO: AVX2 register model, relevant instructions for polynomial arithmetic.
+
+\subsection{Hardware Performance Counters and RAPL}
+TODO: perf, PAPI, Intel RAPL energy domains.
+
+% ── 3. Methodology ───────────────────────────────────────────────────────────
+\section{Methodology}
+\label{sec:methodology}
+
+\subsection{Implementation Variants}
+TODO: ref (AVX2 intrinsics), refnv (scalar, no vectorization), refo0 (unoptimized
+baseline).
+
+\subsection{Benchmark Harness}
+TODO: cycle counter, iteration count, statistical methodology, OSCAR node spec.
+
+\subsection{Hardware Counter Collection}
+TODO: PAPI events selected and why.
+
+\subsection{Energy Measurement}
+TODO: RAPL pkg + DRAM domains, joules-per-operation derivation.
+
+% ── 4. Results ───────────────────────────────────────────────────────────────
+\section{Results}
+\label{sec:results}
+
+\subsection{Cycle Counts}
+
+\begin{table}[h]
+\caption{Median cycle counts, ML-KEM-512, 10\,000 iterations.}
+\label{tab:cycles512}
+\begin{tabular}{lrrr}
+\toprule
+Operation & ref (AVX2) & refnv (scalar) & speedup \\
+\midrule
+NTT                         & TODO & TODO & TODO$\times$ \\
+INVNTT                      & TODO & TODO & TODO$\times$ \\
+polyvec\_basemul\_acc       & TODO & TODO & TODO$\times$ \\
+indcpa\_keypair             & TODO & TODO & TODO$\times$ \\
+indcpa\_enc                 & TODO & TODO & TODO$\times$ \\
+kyber\_encaps               & TODO & TODO & TODO$\times$ \\
+kyber\_decaps               & TODO & TODO & TODO$\times$ \\
+\bottomrule
+\end{tabular}
+\end{table}
+
+\subsection{Hardware Counter Breakdown}
+TODO: IPC, cache miss rates, branch mispredictions.
+
+\subsection{Energy Efficiency}
+TODO: joules/operation, EDP comparison.
+
+% ── 5. Discussion ────────────────────────────────────────────────────────────
+\section{Discussion}
+\label{sec:discussion}
+
+TODO: mechanistic explanation of where the speedup comes from.
+
+% ── 6. Related Work ──────────────────────────────────────────────────────────
+\section{Related Work}
+\label{sec:related}
+
+TODO
+
+% ── 7. Conclusion ────────────────────────────────────────────────────────────
+\section{Conclusion}
+\label{sec:conclusion}
+
+TODO
+
+% ── References ───────────────────────────────────────────────────────────────
+\bibliographystyle{ACM-Reference-Format}
+\bibliography{refs}
+
+\end{document}
--- a/paper/refs.bib
+++ b/paper/refs.bib
@ -0,0 +1,99 @@
+% ── Post-Quantum Cryptography Standards ──────────────────────────────────────
+
+@techreport{fips203,
+  author      = {{National Institute of Standards and Technology}},
+  title       = {{Module-Lattice-Based Key-Encapsulation Mechanism Standard}},
+  institution = {NIST},
+  year        = {2024},
+  number      = {FIPS 203},
+  url         = {https://doi.org/10.6028/NIST.FIPS.203},
+}
+
+@techreport{fips204,
+  author      = {{National Institute of Standards and Technology}},
+  title       = {{Module-Lattice-Based Digital Signature Standard}},
+  institution = {NIST},
+  year        = {2024},
+  number      = {FIPS 204},
+  url         = {https://doi.org/10.6028/NIST.FIPS.204},
+}
+
+@techreport{fips205,
+  author      = {{National Institute of Standards and Technology}},
+  title       = {{Stateless Hash-Based Digital Signature Standard}},
+  institution = {NIST},
+  year        = {2024},
+  number      = {FIPS 205},
+  url         = {https://doi.org/10.6028/NIST.FIPS.205},
+}
+
+% ── Kyber / ML-KEM ───────────────────────────────────────────────────────────
+
+@inproceedings{kyber2018,
+  author    = {Bos, Joppe W. and Ducas, Léo and Kiltz, Eike and Lepoint, Tancrède
+               and Lyubashevsky, Vadim and Schanck, John M. and Schwabe, Peter
+               and Seiler, Gregor and Stehlé, Damien},
+  title     = {{CRYSTALS -- Kyber: A CCA-Secure Module-Lattice-Based KEM}},
+  booktitle = {IEEE European Symposium on Security and Privacy (EuroS\&P)},
+  year      = {2018},
+  pages     = {353--367},
+  doi       = {10.1109/EuroSP.2018.00032},
+}
+
+@misc{kyber-avx2,
+  author    = {Schwabe, Peter and Seiler, Gregor},
+  title     = {{Kyber: Official Reference and AVX2-Optimized Implementation}},
+  note      = {AVX2 implementation in the pq-crystals project},
+  url       = {https://github.com/pq-crystals/kyber},
+}
+
+% ── SIMD and Microarchitecture ────────────────────────────────────────────────
+
+@manual{intel-avx2,
+  author    = {{Intel Corporation}},
+  title     = {{Intel 64 and IA-32 Architectures Software Developer's Manual}},
+  year      = {2024},
+  note      = {Volume 2: Instruction Set Reference},
+}
+
+@inproceedings{ntt-survey,
+  author    = {Longa, Patrick and Naehrig, Michael},
+  title     = {{Speeding Up the Number Theoretic Transform for Faster Ideal
+                Lattice-Based Cryptography}},
+  booktitle = {CANS},
+  year      = {2016},
+  doi       = {10.1007/978-3-319-48965-0_8},
+}
+
+% ── Energy Measurement ───────────────────────────────────────────────────────
+
+@inproceedings{rapl,
+  author    = {David, Howard and Gorbatov, Eugene and Hanebutte, Ulf R. and
+               Khanna, Rahul and Le, Christian},
+  title     = {{RAPL: Memory Power Estimation and Capping}},
+  booktitle = {ISLPED},
+  year      = {2010},
+  doi       = {10.1145/1840845.1840883},
+}
+
+% ── Related Benchmarking Work ────────────────────────────────────────────────
+
+@misc{pqclean,
+  author    = {{PQClean Contributors}},
+  title     = {{PQClean: Clean, portable, tested implementations of post-quantum
+                cryptography}},
+  url       = {https://github.com/PQClean/PQClean},
+}
+
+@misc{liboqs,
+  author    = {{Open Quantum Safe Project}},
+  title     = {{liboqs: C library for quantum-safe cryptographic algorithms}},
+  url       = {https://github.com/open-quantum-safe/liboqs},
+}
+
+@misc{pqm4,
+  author    = {Kannwischer, Matthias J. and Rijneveld, Joost and Schwabe, Peter
+               and Stoffelen, Ko},
+  title     = {{pqm4: Post-quantum crypto library for the ARM Cortex-M4}},
+  url       = {https://github.com/mupq/pqm4},
+}
--- a/slurm/templates/bench_mlkem.sh.tmpl
+++ b/slurm/templates/bench_mlkem.sh.tmpl
@ -0,0 +1,38 @@
+#!/bin/bash
+# Template SLURM job for ML-KEM benchmarking.
+# Variables filled in by slurm/submit.sh:
+#   PARAM    — 512 | 768 | 1024
+#   VARIANT  — ref | refnv | avx2 | ...
+#   NTESTS   — iterations per operation (default 10000)
+#   BINARY   — path to compiled benchmark binary
+
+#SBATCH -J bench_mlkem${PARAM}_${VARIANT}
+#SBATCH -p batch
+#SBATCH -n 1
+#SBATCH --mem=2G
+#SBATCH -t 02:00:00
+#SBATCH --constraint=intel
+#SBATCH -o %j_mlkem${PARAM}_${VARIANT}.out
+
+# NOTE(review): GOMP_CPU_AFFINITY is read only by GNU OpenMP (libgomp) runtimes; a binary that does not use libgomp is not pinned by it.
+# Nothing in this script changes CPU frequency scaling; if pinning or a fixed frequency is required it must be arranged separately (e.g. taskset, node policy).
+export GOMP_CPU_AFFINITY="0"
+
+NTESTS=${NTESTS:-10000}
+BINARY=${BINARY:-./bench_mlkem${PARAM}_${VARIANT}}
+
+if [[ ! -x "$BINARY" ]]; then
+  echo "ERROR: binary not found or not executable: $BINARY" >&2
+  exit 1
+fi
+
+echo "=== bench_mlkem${PARAM}_${VARIANT} ==="
+echo "SLURM_JOB_ID:  $SLURM_JOB_ID"
+echo "SLURM_NODELIST: $SLURM_NODELIST"
+echo "NTESTS:        $NTESTS"
+echo "DATE:          $(date -Iseconds)"
+echo "UNAME:         $(uname -a)"
+echo "CPU:           $(grep 'model name' /proc/cpuinfo | head -1 | cut -d: -f2 | xargs)"
+echo "---"
+
+"$BINARY" "$NTESTS"
--- a/util/analyze_simd
+++ b/util/analyze_simd
--- a/util/analyze_simd.go
+++ b/util/analyze_simd.go
@ -1,178 +0,0 @@
-/*
- *
- * FILE: analyze_simd.go
- * LATEST: 10:08 05 May 2025
- * DESC: find percentage of a dumped amd64 object file's instructions that are SIMD instructions
- * AUTHOR: Levi Neuwirth <ln@levineuwirth.org>
- *
- */
-
-package main
-
-import (
-	"bufio"
-	"fmt"
-	"log"
-	"os"
-	"regexp"
-	"strings"
-)
-
-var total int
-var simd int
-
-// Since Go doesn't have a hashset, we will use a hashmap and ignore the Value...
-var simdInstr map[string]bool
-var digits []string
-
-func main() {
-	if len(os.Args) < 1 {
-		log.Fatal("Usage: ./analyze_simd <path to .txt from objdump>")
-	}
-
-	objDumpRaw, err := os.Open(os.Args[1])
-	if err != nil {
-		log.Fatal(err)
-	} else {
-		fmt.Println("Successfully opened object dump. Investigating...")
-	}
-	defer objDumpRaw.Close()
-
-	initDigits()
-	initSimdInstructions()
-
-	// This regex magic will get us the instructions from an extracted objdump line.
-	instrRegex := regexp.MustCompile(`\b([a-z]{2,6}[a-z]*)\b`)
-
-	scanner := bufio.NewScanner(objDumpRaw)
-	for scanner.Scan() {
-		localLine := scanner.Text()
-		localLineSplit := strings.Fields(localLine)
-
-		if len(localLineSplit) < 2 || !strings.Contains(localLineSplit[0], ":") {
-			continue
-		}
-
-		matches := instrRegex.FindAllString(localLine, -1)
-		if len(matches) == 0 {
-			continue
-		}
-
-		instr := matches[0]
-		log.Println(instr)
-		if simdInstr[instr] {
-			simd++
-		}
-		total++
-	}
-
-	if err := scanner.Err(); err != nil {
-		log.Fatal(err)
-	}
-
-	fmt.Printf("The result is:\n%d SIMD instructions\n%d Total instructions\n", simd, total)
-}
-
-func initSimdInstructions() {
-	simdInstr = map[string]bool{
-		// MMX Instructions
-		"packsswb": true, "packssdw": true,
-		"packuswb": true, "paddb": true, "paddw": true, "paddd": true,
-		"paddsb": true, "paddsw": true, "paddusb": true, "paddusw": true,
-		"pand": true, "pandn": true, "pcmpeqb": true, "pcmpeqw": true,
-		"pcmpeqd": true, "pcmpgtb": true, "pcmpgtw": true, "pcmpgtd": true,
-		"pmaddwd": true, "pmulhw": true, "pmullw": true, "por": true,
-		"psllw": true, "pslld": true, "psllq": true, "psraw": true,
-		"psrad": true, "psrlw": true, "psrld": true, "psrlq": true,
-		"psubb": true, "psubw": true, "psubd": true, "psubsb": true,
-		"psubsw": true, "psubusb": true, "psubusw": true, "punpckhbw": true,
-		"punpckhwd": true, "punpckhdq": true, "punpcklbw": true, "punpcklwd": true,
-		"punpckldq": true, "pxor": true,
-
-		// SSE Instructions
-		"addps": true, "addss": true, "andps": true, "andnps": true,
-		"cmpeqps": true, "cmpeqss": true, "cmpgeps": true, "cmpgess": true,
-		"cmpgtps": true, "cmpgtss": true, "cmpleps": true, "cmpless": true,
-		"cmpltps": true, "cmpltss": true, "cmpneqps": true, "cmpneqss": true,
-		"cmpngeps": true, "cmpngess": true, "cmpngtps": true, "cmpngtss": true,
-		"cmpnleps": true, "cmpnless": true, "cmpnltps": true, "cmpnltss": true,
-		"cmpordps": true, "cmpordss": true, "cmpunordps": true, "cmpunordss": true,
-		"divps": true, "divss": true, "maxps": true, "maxss": true,
-		"minps": true, "minss": true, "movaps": true, "movss": true,
-		"movups": true, "mulps": true, "mulss": true, "rcpps": true,
-		"rcpss": true, "rsqrtps": true, "rsqrtss": true, "sqrtps": true,
-		"sqrtss": true, "subps": true, "subss": true, "xorps": true,
-
-		// SSE2 Instructions
-		"addpd": true, "addsd": true, "andpd": true, "andnpd": true,
-		"cmpeqpd": true, "cmpeqsd": true, "cmpgepd": true, "cmpgesd": true,
-		"cmpgtpd": true, "cmpgtsd": true, "cmplepd": true, "cmplesd": true,
-		"cmpltpd": true, "cmpltsd": true, "cmpneqpd": true, "cmpneqsd": true,
-		"cmpngepd": true, "cmpngesd": true, "cmpngtpd": true, "cmpngtsd": true,
-		"cmpnlepd": true, "cmpnlesd": true, "cmpnltpd": true, "cmpnltsd": true,
-		"cmpordpd": true, "cmpordsd": true, "cmpunordpd": true, "cmpunordsd": true,
-		"divpd": true, "divsd": true, "maxpd": true, "maxsd": true,
-		"minpd": true, "minsd": true, "movapd": true, "movsd": true,
-		"movupd": true, "mulpd": true, "mulsd": true, "sqrtpd": true,
-		"subpd": true, "subsd": true, "xorpd": true,
-
-		// SSE3 Instructions
-		"addsubpd": true, "addsubps": true, "haddpd": true, "haddps": true,
-		"hsubpd": true, "hsubps": true, "lddqu": true, "monitor": true,
-		"mwait": true, "movddup": true, "movshdup": true, "movsldup": true,
-
-		// SSSE3 Instructions
-		"pshufb": true, "phaddw": true, "phaddd": true, "phaddsw": true,
-		"pmaddubsw": true, "phsubw": true, "phsubd": true, "phsubsw": true,
-		"psignb": true, "psignw": true, "psignd": true, "pmulhrsw": true,
-		"palignr": true,
-
-		// SSE4.1 Instructions
-		"blendpd": true, "blendps": true, "blendvpd": true, "blendvps": true,
-		"dppd": true, "dpps": true, "extractps": true, "insertps": true,
-		"movntdqa": true, "mpsadbw": true, "packusdw": true, "pblendvb": true,
-		"pblendw": true, "pcmpeqq": true, "pextrb": true, "pextrd": true,
-		"pextrq": true, "phminposuw": true, "pinsrb": true, "pinsrd": true,
-		"pinsrq": true, "pmuldq": true, "pmulld": true, "ptest": true,
-		"roundpd": true, "roundps": true, "roundsd": true, "roundss": true,
-
-		// SSE4.2 Instructions
-		"pcmpestri": true, "pcmpestrm": true, "pcmpistri": true, "pcmpistrm": true,
-		"crc32": true, "popcnt": true,
-
-		// AVX Instructions
-		"vaddpd": true, "vaddps": true, "vaddsd": true, "vaddss": true,
-		"vandpd": true, "vandps": true, "vandnpd": true, "vandnps": true,
-		"vdivpd": true, "vdivps": true, "vdivsd": true, "vdivss": true,
-		"vmaxpd": true, "vmaxps": true, "vmaxsd": true, "vmaxss": true,
-		"vminpd": true, "vminps": true, "vminsd": true, "vminss": true,
-		"vmulpd": true, "vmulps": true, "vmulsd": true, "vmulss": true,
-		"vorpd": true, "vorps": true, "vsqrtpd": true, "vsqrtps": true,
-		"vsqrtsd": true, "vsqrtss": true, "vsubpd": true, "vsubps": true,
-		"vsubsd": true, "vsubss": true, "vxorpd": true, "vxorps": true,
-
-		// AVX2 Instructions
-		"vpabsb": true, "vpabsw": true, "vpabsd": true, "vpaddb": true,
-		"vpaddw": true, "vpaddd": true, "vpaddq": true, "vpaddsb": true,
-		"vpaddsw": true, "vpaddusb": true, "vpaddusw": true, "vpalignr": true,
-		"vpand": true, "vpandn": true, "vpavgb": true, "vpavgw": true,
-		"vpblendd": true, "vpcmpeqb": true, "vpcmpeqw": true, "vpcmpeqd": true,
-		"vpcmpeqq": true, "vpcmpgtb": true, "vpcmpgtw": true, "vpcmpgtd": true,
-
-		// AVX512 not included since Kyber does not use it.
-	}
-}
-
-func initDigits() {
-	digits = make([]string, 0)
-	digits = append(digits, "0")
-	digits = append(digits, "1")
-	digits = append(digits, "2")
-	digits = append(digits, "3")
-	digits = append(digits, "4")
-	digits = append(digits, "5")
-	digits = append(digits, "6")
-	digits = append(digits, "7")
-	digits = append(digits, "8")
-	digits = append(digits, "9")
-}
--- a/util/testrun_sum_std
+++ b/util/testrun_sum_std
--- a/util/testrun_sum_std.go
+++ b/util/testrun_sum_std.go
@ -1,137 +0,0 @@
-/*
- *
- * FILE: testrun_sum_std.go
- * LATEST: 10:19 05 May 2025
- * DESC: sum values from iterative Kyber batch jobs.
- * AUTHOR: Levi Neuwirth <ln@levineuwirth.org>
- *
- */
-
-package main
-
-import (
-	"bufio"
-	"fmt"
-	"log"
-	"math"
-	"os"
-	"strconv"
-	"strings"
-)
-
-var count float64
-var testSums map[string]float64
-var lastTest string
-var gen_a []float64
-var indcpa_keypair []float64
-var indcpa_enc []float64
-var keypair_derand []float64
-var keypair []float64
-var encaps []float64
-var decaps []float64
-
-func main() {
-	if len(os.Args) < 1 {
-		log.Fatal("Usage: ./testrun_sum_std <path to slurm.OUT file>")
-	}
-
-	outRaw, err := os.Open(os.Args[1])
-	if err != nil {
-		log.Fatal(err)
-	} else {
-		fmt.Println("Successfully opened slurm STDOUT")
-	}
-	defer outRaw.Close()
-	initTestSums()
-	count = 0
-	lastTest = "none"
-	scanner := bufio.NewScanner(outRaw)
-	for scanner.Scan() {
-		localLine := scanner.Text()
-		// Check if we've hit a new test iteration
-		if strings.Contains(localLine, "Loop spin:") {
-			count += 1
-			continue
-			// Otherwise, we might have data from a previously indicated test.
-		} else if strings.Contains(localLine, "average:") {
-			// We split the line and add to the appropriate testSums index.
-			line := localLine[9:]
-			var numberStr strings.Builder
-			for _, ch := range line {
-				if (ch >= '0' && ch <= '9') || ch == '.' {
-					numberStr.WriteRune(ch)
-				} else {
-					break
-				}
-			}
-
-			add, err := strconv.ParseFloat(numberStr.String(), 64)
-			if err != nil {
-				log.Printf("Failed to parse number from line %q: %v", localLine, err)
-				continue
-			}
-			testSums[lastTest] += add
-			// And now for the stddev:
-			switch lastTest {
-			case "gen_a:":
-				gen_a = append(gen_a, add)
-			case "indcpa_keypair:":
-				indcpa_keypair = append(indcpa_keypair, add)
-			case "indcpa_enc:":
-				indcpa_enc = append(indcpa_enc, add)
-			case "kyber_keypair_derand:":
-				keypair_derand = append(keypair_derand, add)
-			case "kyber_keypair:":
-				keypair = append(keypair, add)
-			case "kyber_encaps:":
-				encaps = append(encaps, add)
-			case "kyber_decaps:":
-				decaps = append(decaps, add)
-			default:
-				continue
-			}
-			continue
-			// We aren't concerned with the medians here.
-		} else if strings.Contains(localLine, "median:") {
-			continue
-		}
-
-		// Here, figure out what the test was for the next data.
-		trimmed := strings.TrimSpace(localLine)
-		if strings.HasSuffix(trimmed, ":") && !strings.Contains(trimmed, "average") && !strings.Contains(trimmed, "median") {
-			lastTest = trimmed
-			continue
-		}
-
-	}
-
-	// Now we take the averages and stddevs.
-	fmt.Printf("gen_a avg: %f\ngen_a stddev: %f\n", testSums["gen_a:"]/count, calcStddev("gen_a:", gen_a))
-	fmt.Printf("indcpa keypair avg: %f\nindcpa_keypair stddev: %f\n", testSums["indcpa_keypair:"]/count, calcStddev("indcpa_keypair:", indcpa_keypair))
-	fmt.Printf("indcpa enc avg: %f\nindcpa_enc stddev: %f\n", testSums["indcpa_enc:"]/count, calcStddev("indcpa_enc:", indcpa_enc))
-	fmt.Printf("keypair_derand avg: %f\nkeypair_derand stddev:: %f\n", testSums["kyber_keypair_derand:"]/count, calcStddev("kyber_keypair_derand:", keypair_derand))
-	fmt.Printf("keypair avg: %f\nkeypair stddev:: %f\n", testSums["kyber_keypair:"]/count, calcStddev("kyber_keypair:", keypair))
-	fmt.Printf("encaps avg: %f\nencaps stddev:: %f\n", testSums["kyber_encaps:"]/count, calcStddev("kyber_encaps:", encaps))
-	fmt.Printf("decaps avg: %f\ndecaps stddev:: %f\n", testSums["kyber_decaps:"]/count, calcStddev("kyber_decaps:", decaps))
-}
-
-func initTestSums() {
-	testSums = make(map[string]float64)
-	testSums["gen_a:"] = 0
-	testSums["indcpa_keypair:"] = 0
-	testSums["indcpa_enc:"] = 0
-	testSums["kyber_keypair_derand:"] = 0
-	testSums["kyber_keypair:"] = 0
-	testSums["kyber_encaps:"] = 0
-	testSums["kyber_decaps:"] = 0
-}
-
-func calcStddev(test string, inputs []float64) (result float64) {
-	mean := float64(testSums[test] / float64(len(inputs)))
-	var variance float64
-	for _, value := range inputs {
-		variance += (value - mean) * (value - mean)
-	}
-
-	return math.Sqrt(variance / float64(len(inputs)))
-}