\relax \providecommand\hyper@newdestlabel[2]{} \providecommand\HyField@AuxAddToFields[1]{} \providecommand\HyField@AuxAddToCoFields[2]{} \citation{fips203,fips204,fips205} \citation{bettini2024} \citation{kyber-avx2} \citation{fips203} \citation{ntt-survey} \@writefile{toc}{\contentsline {section}{Abstract}{1}{section*.1}\protected@file@percent } \@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}\protected@file@percent } \newlabel{sec:intro}{{1}{1}{Introduction}{section.1}{}} \@writefile{toc}{\contentsline {section}{\numberline {2}Background}{1}{section.2}\protected@file@percent } \newlabel{sec:background}{{2}{1}{Background}{section.2}{}} \@writefile{toc}{\contentsline {subsection}{\numberline {2.1}ML-KEM and the Number Theoretic Transform}{1}{subsection.2.1}\protected@file@percent } \citation{kyber-avx2} \citation{papi} \citation{rapl} \citation{kyber-avx2} \@writefile{toc}{\contentsline {subsection}{\numberline {2.2}AVX2 SIMD on x86-64}{2}{subsection.2.2}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Compilation Variants}{2}{subsection.2.3}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {2.4}Hardware Performance Counters and Energy}{2}{subsection.2.4}\protected@file@percent } \newlabel{sec:bg:papi}{{2.4}{2}{Hardware Performance Counters and Energy}{subsection.2.4}{}} \@writefile{tdo}{\contentsline {todo}{Phase 2: Expand with PAPI and RAPL background once data is collected.}{2}{section*.6}\protected@file@percent } \pgfsyspdfmark {pgfid1}{20915651}{45096352} \pgfsyspdfmark {pgfid4}{38210436}{45099302} \pgfsyspdfmark {pgfid5}{38980483}{44906577} \@writefile{toc}{\contentsline {section}{\numberline {3}Methodology}{2}{section.3}\protected@file@percent } \newlabel{sec:methodology}{{3}{2}{Methodology}{section.3}{}} \@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Implementation Source}{2}{subsection.3.1}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Compilation Variants}{2}{subsection.3.2}\protected@file@percent } \newlabel{sec:meth:variants}{{3.2}{2}{Compilation Variants}{subsection.3.2}{}} \@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Benchmark Harness}{2}{subsection.3.3}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {3.4}Hardware Platform}{2}{subsection.3.4}\protected@file@percent } \@writefile{tdo}{\contentsline {todo}{Phase 2: Hardware counter collection via PAPI.}{3}{section*.7}\protected@file@percent } \pgfsyspdfmark {pgfid6}{12703613}{37681124} \pgfsyspdfmark {pgfid7}{2015231}{37684074} \pgfsyspdfmark {pgfid8}{2785278}{37491349} \@writefile{toc}{\contentsline {subsection}{\numberline {3.5}Statistical Methodology}{3}{subsection.3.5}\protected@file@percent } \newlabel{sec:meth:stats}{{3.5}{3}{Statistical Methodology}{subsection.3.5}{}} \@writefile{toc}{\contentsline {subsection}{\numberline {3.6}Energy Measurement}{3}{subsection.3.6}\protected@file@percent } \newlabel{sec:meth:energy}{{3.6}{3}{Energy Measurement}{subsection.3.6}{}} \@writefile{tdo}{\contentsline {todo}{Phase 2: Intel RAPL (pkg + DRAM domains), EDP computation, per-operation joules.}{3}{section*.8}\protected@file@percent } \pgfsyspdfmark {pgfid11}{3538944}{24335452} \pgfsyspdfmark {pgfid12}{2015231}{24338402} \pgfsyspdfmark {pgfid13}{2785278}{24145677} \@writefile{toc}{\contentsline {section}{\numberline {4}Results}{3}{section.4}\protected@file@percent } \newlabel{sec:results}{{4}{3}{Results}{section.4}{}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Cycle Count Distributions}{3}{subsection.4.1}\protected@file@percent } \newlabel{sec:results:distributions}{{4.1}{3}{Cycle Count Distributions}{subsection.4.1}{}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.2}Speedup Decomposition}{3}{subsection.4.2}\protected@file@percent } \newlabel{sec:results:decomp}{{4.2}{3}{Speedup Decomposition}{subsection.4.2}{}} \@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Cycle count distributions for three representative ML-KEM-512 operations. Log $x$-axis. Dashed lines mark medians. Right-skew and outlier structure motivate nonparametric statistics.}}{3}{figure.caption.9}\protected@file@percent } \providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}} \newlabel{fig:distributions}{{1}{3}{Cycle count distributions for three representative \mlkemk {512} operations. Log $x$-axis. Dashed lines mark medians. Right-skew and outlier structure motivate nonparametric statistics}{figure.caption.9}{}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.3}Hand-Written SIMD Speedup}{3}{subsection.4.3}\protected@file@percent } \newlabel{sec:results:simd}{{4.3}{3}{Hand-Written SIMD Speedup}{subsection.4.3}{}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.4}Statistical Significance}{3}{subsection.4.4}\protected@file@percent } \newlabel{sec:results:stats}{{4.4}{3}{Statistical Significance}{subsection.4.4}{}} \@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Cumulative speedup at each optimization stage, normalized to \texttt {refo0}{} (1×). Three bars per operation: \textcolor {colRefnv}{$\blacksquare $}\nonbreakingspace O3 no auto-vec, \textcolor {colRef}{$\blacksquare $}\nonbreakingspace O3 + auto-vec, \textcolor {colAvx}{$\blacksquare $}\nonbreakingspace O3 + hand SIMD (AVX2). Log $y$-axis; 95\% bootstrap CI shown on \texttt {avx2}{} bars. Sorted by \texttt {avx2}{} speedup.}}{4}{figure.caption.10}\protected@file@percent } \newlabel{fig:decomp}{{2}{4}{Cumulative speedup at each optimization stage, normalized to \varrefo {} (1×). Three bars per operation: \textcolor {colRefnv}{$\blacksquare $}~O3 no auto-vec, \textcolor {colRef}{$\blacksquare $}~O3 + auto-vec, \textcolor {colAvx}{$\blacksquare $}~O3 + hand SIMD (AVX2). Log $y$-axis; 95\% bootstrap CI shown on \varavx {} bars. Sorted by \varavx {} speedup}{figure.caption.10}{}} \@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Hand-written SIMD speedup (\texttt {ref}{} $\to $ \texttt {avx2}{}) per operation, across all three ML-KEM{} parameter sets. Log $y$-axis. 95\% bootstrap CI error bars (often sub-pixel). Sorted by ML-KEM-512 speedup.}}{4}{figure.caption.11}\protected@file@percent } \newlabel{fig:handsimd}{{3}{4}{Hand-written SIMD speedup (\varref {} $\to $ \varavx {}) per operation, across all three \mlkem {} parameter sets. Log $y$-axis. 95\% bootstrap CI error bars (often sub-pixel). Sorted by \mlkemk {512} speedup}{figure.caption.11}{}} \@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Hand-written SIMD speedup (\texttt {ref}{} $\to $ \texttt {avx2}{}), median ratio with 95\% bootstrap CI. All Cliff's $\delta = +1.000$, $p < 10^{-300}$.}}{4}{table.caption.12}\protected@file@percent } \newlabel{tab:simd}{{1}{4}{Hand-written SIMD speedup (\varref {} $\to $ \varavx {}), median ratio with 95\% bootstrap CI. All Cliff's $\delta = +1.000$, $p < 10^{-300}$}{table.caption.12}{}} \@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Cliff's $\delta $ (\texttt {ref}{} vs.\ \texttt {avx2}{}) for all operations and parameter sets. $\delta = +1$: AVX2 is faster in every observation pair. Nearly all cells are at $+1.000$.}}{4}{figure.caption.13}\protected@file@percent } \newlabel{fig:cliffs}{{4}{4}{Cliff's $\delta $ (\varref {} vs.\ \varavx {}) for all operations and parameter sets. $\delta = +1$: AVX2 is faster in every observation pair. Nearly all cells are at $+1.000$}{figure.caption.13}{}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.5}Cross-Parameter Consistency}{4}{subsection.4.5}\protected@file@percent } \newlabel{sec:results:crossparams}{{4.5}{4}{Cross-Parameter Consistency}{subsection.4.5}{}} \@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces Per-polynomial operation speedup (\texttt {ref}{} $\to $ \texttt {avx2}{}) across security parameters. Polynomial dimension is 256 for all; variation reflects cache-state differences in the calling context.}}{5}{figure.caption.14}\protected@file@percent } \newlabel{fig:crossparams}{{5}{5}{Per-polynomial operation speedup (\varref {} $\to $ \varavx {}) across security parameters. Polynomial dimension is 256 for all; variation reflects cache-state differences in the calling context}{figure.caption.14}{}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.6}Hardware Counter Breakdown}{5}{subsection.4.6}\protected@file@percent } \newlabel{sec:results:papi}{{4.6}{5}{Hardware Counter Breakdown}{subsection.4.6}{}} \@writefile{tdo}{\contentsline {todo}{Phase 2: IPC, L1/L2/L3 cache miss rates, branch mispredictions via PAPI. This section will contain bar charts of per-counter values comparing ref and avx2 for each operation, explaining the mechanistic origins of the speedup.}{5}{section*.15}\protected@file@percent } \pgfsyspdfmark {pgfid264}{3538944}{21389118} \pgfsyspdfmark {pgfid265}{2015231}{21392068} \pgfsyspdfmark {pgfid266}{2785278}{21199343} \@writefile{toc}{\contentsline {subsection}{\numberline {4.7}Energy Efficiency}{5}{subsection.4.7}\protected@file@percent } \newlabel{sec:results:energy}{{4.7}{5}{Energy Efficiency}{subsection.4.7}{}} \@writefile{tdo}{\contentsline {todo}{Phase 2: Intel RAPL pkg + DRAM energy readings per operation. EDP (energy-delay product) comparison. Energy per KEM operation.}{5}{section*.16}\protected@file@percent } \pgfsyspdfmark {pgfid269}{3538944}{19496559} \pgfsyspdfmark {pgfid270}{2015231}{-14840343} \pgfsyspdfmark {pgfid271}{2785278}{-15033068} \@writefile{toc}{\contentsline {section}{\numberline {5}Discussion}{5}{section.5}\protected@file@percent } \newlabel{sec:discussion}{{5}{5}{Discussion}{section.5}{}} \@writefile{toc}{\contentsline {subsection}{\numberline {5.1}Why Arithmetic Operations Benefit Most}{5}{subsection.5.1}\protected@file@percent } \@writefile{tdo}{\contentsline {todo}{Phase 2: Confirm with IPC and port utilisation counters.}{5}{section*.17}\protected@file@percent } \pgfsyspdfmark {pgfid274}{13184317}{5758368} \pgfsyspdfmark {pgfid275}{2015231}{-36522418} \pgfsyspdfmark {pgfid276}{2785278}{-36715143} \@writefile{toc}{\contentsline {subsection}{\numberline {5.2}Why the Compiler Cannot Auto-Vectorise NTT}{5}{subsection.5.2}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {5.3}Why SHAKE Operations Benefit Less}{5}{subsection.5.3}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {5.4}Why Noise Sampling Barely Benefits}{5}{subsection.5.4}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {5.5}NTT Cache-State Variation Across Parameter Sets}{5}{subsection.5.5}\protected@file@percent } \@writefile{tdo}{\contentsline {todo}{Phase 2: Verify with L1/L2 miss counters split by scalar vs AVX2.}{5}{section*.18}\protected@file@percent } \pgfsyspdfmark {pgfid279}{25927376}{9612704} \pgfsyspdfmark {pgfid282}{38210436}{9615654} \pgfsyspdfmark {pgfid283}{38980483}{9422929} \@writefile{toc}{\contentsline {subsection}{\numberline {5.6}Implications for Deployment}{5}{subsection.5.6}\protected@file@percent } \citation{kyber-avx2} \citation{pqclean} \citation{kyber2018} \citation{fips203} \citation{pqm4} \citation{supercop} \citation{pqm4} \citation{gueron2014} \citation{bernstein2006} \citation{ntt-survey} \citation{cachetime} \citation{papi} \bibstyle{ACM-Reference-Format} \bibdata{refs} \bibcite{bernstein2006}{{1}{2006}{{Bernstein}}{{}}} \bibcite{supercop}{{2}{[n.\,d.]}{{Bernstein and Lange}}{{}}} \bibcite{cachetime}{{3}{2008}{{Bernstein and Schwabe}}{{}}} \bibcite{kyber2018}{{4}{2018}{{Bos et~al\mbox {.}}}{{}}} \bibcite{rapl}{{5}{2010}{{David et~al\mbox {.}}}{{}}} \bibcite{bettini2024}{{6}{2023}{{Google Security Blog}}{{}}} \@writefile{toc}{\contentsline {subsection}{\numberline {5.7}Limitations}{6}{subsection.5.7}\protected@file@percent } \@writefile{toc}{\contentsline {paragraph}{No hardware counter data (Phase\nonbreakingspace 1).}{6}{section*.19}\protected@file@percent } \@writefile{tdo}{\contentsline {todo}{Phase 2: PAPI counters: IPC, cache miss rates.}{6}{section*.20}\protected@file@percent } \pgfsyspdfmark {pgfid284}{16379392}{38731168} \pgfsyspdfmark {pgfid285}{2015231}{38734118} \pgfsyspdfmark {pgfid286}{2785278}{38541393} \@writefile{toc}{\contentsline {paragraph}{Single microarchitecture.}{6}{section*.21}\protected@file@percent } \@writefile{tdo}{\contentsline {todo}{Phase 3: Repeat on AMD Zen, ARM Graviton3, RISC-V.}{6}{section*.22}\protected@file@percent } \pgfsyspdfmark {pgfid289}{6791818}{34708896} \pgfsyspdfmark {pgfid290}{2015231}{32453210} \pgfsyspdfmark {pgfid291}{2785278}{32260485} \@writefile{toc}{\contentsline {paragraph}{Frequency scaling.}{6}{section*.23}\protected@file@percent } \@writefile{tdo}{\contentsline {todo}{Phase 2: Characterize frequency during benchmarks; consider RAPL-normalized cycle counts.}{6}{section*.24}\protected@file@percent } \pgfsyspdfmark {pgfid294}{6161296}{30686624} \pgfsyspdfmark {pgfid295}{2015231}{24009614} \pgfsyspdfmark {pgfid296}{2785278}{23816889} \@writefile{toc}{\contentsline {section}{\numberline {6}Related Work}{6}{section.6}\protected@file@percent } \newlabel{sec:related}{{6}{6}{Related Work}{section.6}{}} \@writefile{toc}{\contentsline {paragraph}{ML-KEM / Kyber implementations.}{6}{section*.25}\protected@file@percent } \@writefile{toc}{\contentsline {paragraph}{PQC benchmarking.}{6}{section*.26}\protected@file@percent } \@writefile{toc}{\contentsline {paragraph}{SIMD in cryptography.}{6}{section*.27}\protected@file@percent } \@writefile{toc}{\contentsline {paragraph}{NTT optimization.}{6}{section*.28}\protected@file@percent } \@writefile{toc}{\contentsline {paragraph}{Hardware counter profiling.}{6}{section*.29}\protected@file@percent } \@writefile{toc}{\contentsline {section}{\numberline {7}Conclusion}{6}{section.7}\protected@file@percent } \newlabel{sec:conclusion}{{7}{6}{Conclusion}{section.7}{}} \@writefile{toc}{\contentsline {paragraph}{Future work.}{6}{section*.30}\protected@file@percent } \@writefile{toc}{\contentsline {paragraph}{Artifact.}{6}{section*.31}\protected@file@percent } \@writefile{toc}{\contentsline {section}{References}{6}{section*.33}\protected@file@percent } \bibcite{gueron2014}{{7}{2013}{{Gueron and Krasnov}}{{}}} \bibcite{papi}{{8}{[n.\,d.]}{{Innovative Computing Laboratory, University of Tennessee}}{{}}} \bibcite{pqm4}{{9}{[n.\,d.]}{{Kannwischer et~al\mbox {.}}}{{}}} \bibcite{ntt-survey}{{10}{2016}{{Longa and Naehrig}}{{}}} \bibcite{fips204}{{11}{2024a}{{National Institute of Standards and Technology}}{{}}} \bibcite{fips203}{{12}{2024b}{{National Institute of Standards and Technology}}{{}}} \bibcite{fips205}{{13}{2024c}{{National Institute of Standards and Technology}}{{}}} \bibcite{pqclean}{{14}{[n.\,d.]}{{PQClean Contributors}}{{}}} \bibcite{kyber-avx2}{{15}{[n.\,d.]}{{Schwabe and Seiler}}{{}}} \newlabel{tocindent-1}{0pt} \newlabel{tocindent0}{0pt} \newlabel{tocindent1}{6.25499pt} \newlabel{tocindent2}{10.34999pt} \newlabel{tocindent3}{0pt} \newlabel{tocindent4}{0pt} \newlabel{tocindent5}{0pt} \@writefile{toc}{\contentsline {section}{\numberline {A}End-to-End KEM Speedup}{7}{appendix.A}\protected@file@percent } \newlabel{sec:supp:kem}{{A}{7}{End-to-End KEM Speedup}{appendix.A}{}} \@writefile{lof}{\contentsline {figure}{\numberline {6}{\ignorespaces End-to-end KEM speedup (\texttt {ref}{} $\to $ \texttt {avx2}{}) for \texttt {kyber\_keypair}, \texttt {kyber\_encaps}, and \texttt {kyber\_decaps}. Intel Xeon Platinum 8268; 95\% bootstrap CI.}}{7}{figure.caption.34}\protected@file@percent } \newlabel{fig:kemlevel}{{6}{7}{End-to-end KEM speedup (\varref {} $\to $ \varavx {}) for \op {kyber\_keypair}, \op {kyber\_encaps}, and \op {kyber\_decaps}. Intel Xeon Platinum 8268; 95\% bootstrap CI}{figure.caption.34}{}} \@writefile{toc}{\contentsline {section}{\numberline {B}Full Operation Set}{7}{appendix.B}\protected@file@percent } \newlabel{sec:supp:fullops}{{B}{7}{Full Operation Set}{appendix.B}{}} \@writefile{tdo}{\contentsline {todo}{Full operation speedup table for all 20 benchmarked operations, including \texttt {poly\_compress}, \texttt {poly\_decompress}, \texttt {polyvec\_compress}, \texttt {poly\_tomsg}, and the \texttt {*\_derand} KEM variants.}{7}{section*.35}\protected@file@percent } \pgfsyspdfmark {pgfid319}{28801187}{27830541} \newlabel{TotPages}{{7}{7}{}{page.7}{}} \gdef \@abspage@last{7}