levineuwirth.org/paper/main.aux

174 lines
16 KiB
TeX
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

\relax
\providecommand\hyper@newdestlabel[2]{}
\providecommand\HyField@AuxAddToFields[1]{}
\providecommand\HyField@AuxAddToCoFields[2]{}
\citation{fips203,fips204,fips205}
\citation{bettini2024}
\citation{kyber-avx2}
\citation{fips203}
\citation{ntt-survey}
\@writefile{toc}{\contentsline {section}{Abstract}{1}{section*.1}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}\protected@file@percent }
\newlabel{sec:intro}{{1}{1}{Introduction}{section.1}{}}
\@writefile{toc}{\contentsline {section}{\numberline {2}Background}{1}{section.2}\protected@file@percent }
\newlabel{sec:background}{{2}{1}{Background}{section.2}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}ML-KEM and the Number Theoretic Transform}{1}{subsection.2.1}\protected@file@percent }
\citation{kyber-avx2}
\citation{papi}
\citation{rapl}
\citation{kyber-avx2}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}AVX2 SIMD on x86-64}{2}{subsection.2.2}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Compilation Variants}{2}{subsection.2.3}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {2.4}Hardware Performance Counters and Energy}{2}{subsection.2.4}\protected@file@percent }
\newlabel{sec:bg:papi}{{2.4}{2}{Hardware Performance Counters and Energy}{subsection.2.4}{}}
\@writefile{tdo}{\contentsline {todo}{Phase 2: Expand with PAPI and RAPL background once data is collected.}{2}{section*.6}\protected@file@percent }
\pgfsyspdfmark {pgfid1}{20915651}{45096352}
\pgfsyspdfmark {pgfid4}{38210436}{45099302}
\pgfsyspdfmark {pgfid5}{38980483}{44906577}
\@writefile{toc}{\contentsline {section}{\numberline {3}Methodology}{2}{section.3}\protected@file@percent }
\newlabel{sec:methodology}{{3}{2}{Methodology}{section.3}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Implementation Source}{2}{subsection.3.1}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Compilation Variants}{2}{subsection.3.2}\protected@file@percent }
\newlabel{sec:meth:variants}{{3.2}{2}{Compilation Variants}{subsection.3.2}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Benchmark Harness}{2}{subsection.3.3}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {3.4}Hardware Platform}{2}{subsection.3.4}\protected@file@percent }
\@writefile{tdo}{\contentsline {todo}{Phase 2: Hardware counter collection via PAPI.}{3}{section*.7}\protected@file@percent }
\pgfsyspdfmark {pgfid6}{12703613}{37681124}
\pgfsyspdfmark {pgfid7}{2015231}{37684074}
\pgfsyspdfmark {pgfid8}{2785278}{37491349}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.5}Statistical Methodology}{3}{subsection.3.5}\protected@file@percent }
\newlabel{sec:meth:stats}{{3.5}{3}{Statistical Methodology}{subsection.3.5}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.6}Energy Measurement}{3}{subsection.3.6}\protected@file@percent }
\newlabel{sec:meth:energy}{{3.6}{3}{Energy Measurement}{subsection.3.6}{}}
\@writefile{tdo}{\contentsline {todo}{Phase 2: Intel RAPL (pkg + DRAM domains), EDP computation, per-operation joules.}{3}{section*.8}\protected@file@percent }
\pgfsyspdfmark {pgfid11}{3538944}{24335452}
\pgfsyspdfmark {pgfid12}{2015231}{24338402}
\pgfsyspdfmark {pgfid13}{2785278}{24145677}
\@writefile{toc}{\contentsline {section}{\numberline {4}Results}{3}{section.4}\protected@file@percent }
\newlabel{sec:results}{{4}{3}{Results}{section.4}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Cycle Count Distributions}{3}{subsection.4.1}\protected@file@percent }
\newlabel{sec:results:distributions}{{4.1}{3}{Cycle Count Distributions}{subsection.4.1}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.2}Speedup Decomposition}{3}{subsection.4.2}\protected@file@percent }
\newlabel{sec:results:decomp}{{4.2}{3}{Speedup Decomposition}{subsection.4.2}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Cycle count distributions for three representative ML-KEM-512 operations. Log $x$-axis. Dashed lines mark medians. Right-skew and outlier structure motivate nonparametric statistics.}}{3}{figure.caption.9}\protected@file@percent }
\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
\newlabel{fig:distributions}{{1}{3}{Cycle count distributions for three representative \mlkemk {512} operations. Log $x$-axis. Dashed lines mark medians. Right-skew and outlier structure motivate nonparametric statistics}{figure.caption.9}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.3}Hand-Written SIMD Speedup}{3}{subsection.4.3}\protected@file@percent }
\newlabel{sec:results:simd}{{4.3}{3}{Hand-Written SIMD Speedup}{subsection.4.3}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.4}Statistical Significance}{3}{subsection.4.4}\protected@file@percent }
\newlabel{sec:results:stats}{{4.4}{3}{Statistical Significance}{subsection.4.4}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Cumulative speedup at each optimization stage, normalized to \texttt {refo0}{} (1×). Three bars per operation: \textcolor {colRefnv}{$\blacksquare $}\nonbreakingspace O3 no auto-vec, \textcolor {colRef}{$\blacksquare $}\nonbreakingspace O3 + auto-vec, \textcolor {colAvx}{$\blacksquare $}\nonbreakingspace O3 + hand SIMD (AVX2). Log $y$-axis; 95\% bootstrap CI shown on \texttt {avx2}{} bars. Sorted by \texttt {avx2}{} speedup.}}{4}{figure.caption.10}\protected@file@percent }
\newlabel{fig:decomp}{{2}{4}{Cumulative speedup at each optimization stage, normalized to \varrefo {} (1×). Three bars per operation: \textcolor {colRefnv}{$\blacksquare $}~O3 no auto-vec, \textcolor {colRef}{$\blacksquare $}~O3 + auto-vec, \textcolor {colAvx}{$\blacksquare $}~O3 + hand SIMD (AVX2). Log $y$-axis; 95\% bootstrap CI shown on \varavx {} bars. Sorted by \varavx {} speedup}{figure.caption.10}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Hand-written SIMD speedup (\texttt {ref}{} $\to $ \texttt {avx2}{}) per operation, across all three ML-KEM{} parameter sets. Log $y$-axis. 95\% bootstrap CI error bars (often sub-pixel). Sorted by ML-KEM-512 speedup.}}{4}{figure.caption.11}\protected@file@percent }
\newlabel{fig:handsimd}{{3}{4}{Hand-written SIMD speedup (\varref {} $\to $ \varavx {}) per operation, across all three \mlkem {} parameter sets. Log $y$-axis. 95\% bootstrap CI error bars (often sub-pixel). Sorted by \mlkemk {512} speedup}{figure.caption.11}{}}
\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Hand-written SIMD speedup (\texttt {ref}{} $\to $ \texttt {avx2}{}), median ratio with 95\% bootstrap CI. All Cliff's $\delta = +1.000$, $p < 10^{-300}$.}}{4}{table.caption.12}\protected@file@percent }
\newlabel{tab:simd}{{1}{4}{Hand-written SIMD speedup (\varref {} $\to $ \varavx {}), median ratio with 95\% bootstrap CI. All Cliff's $\delta = +1.000$, $p < 10^{-300}$}{table.caption.12}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Cliff's $\delta $ (\texttt {ref}{} vs.\ \texttt {avx2}{}) for all operations and parameter sets. $\delta = +1$: AVX2 is faster in every observation pair. Nearly all cells are at $+1.000$.}}{4}{figure.caption.13}\protected@file@percent }
\newlabel{fig:cliffs}{{4}{4}{Cliff's $\delta $ (\varref {} vs.\ \varavx {}) for all operations and parameter sets. $\delta = +1$: AVX2 is faster in every observation pair. Nearly all cells are at $+1.000$}{figure.caption.13}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.5}Cross-Parameter Consistency}{4}{subsection.4.5}\protected@file@percent }
\newlabel{sec:results:crossparams}{{4.5}{4}{Cross-Parameter Consistency}{subsection.4.5}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces Per-polynomial operation speedup (\texttt {ref}{} $\to $ \texttt {avx2}{}) across security parameters. Polynomial dimension is 256 for all; variation reflects cache-state differences in the calling context.}}{5}{figure.caption.14}\protected@file@percent }
\newlabel{fig:crossparams}{{5}{5}{Per-polynomial operation speedup (\varref {} $\to $ \varavx {}) across security parameters. Polynomial dimension is 256 for all; variation reflects cache-state differences in the calling context}{figure.caption.14}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.6}Hardware Counter Breakdown}{5}{subsection.4.6}\protected@file@percent }
\newlabel{sec:results:papi}{{4.6}{5}{Hardware Counter Breakdown}{subsection.4.6}{}}
\@writefile{tdo}{\contentsline {todo}{Phase 2: IPC, L1/L2/L3 cache miss rates, branch mispredictions via PAPI. This section will contain bar charts of per-counter values comparing ref and avx2 for each operation, explaining the mechanistic origins of the speedup.}{5}{section*.15}\protected@file@percent }
\pgfsyspdfmark {pgfid264}{3538944}{21389118}
\pgfsyspdfmark {pgfid265}{2015231}{21392068}
\pgfsyspdfmark {pgfid266}{2785278}{21199343}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.7}Energy Efficiency}{5}{subsection.4.7}\protected@file@percent }
\newlabel{sec:results:energy}{{4.7}{5}{Energy Efficiency}{subsection.4.7}{}}
\@writefile{tdo}{\contentsline {todo}{Phase 2: Intel RAPL pkg + DRAM energy readings per operation. EDP (energy-delay product) comparison. Energy per KEM operation.}{5}{section*.16}\protected@file@percent }
\pgfsyspdfmark {pgfid269}{3538944}{19496559}
\pgfsyspdfmark {pgfid270}{2015231}{-14840343}
\pgfsyspdfmark {pgfid271}{2785278}{-15033068}
\@writefile{toc}{\contentsline {section}{\numberline {5}Discussion}{5}{section.5}\protected@file@percent }
\newlabel{sec:discussion}{{5}{5}{Discussion}{section.5}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.1}Why Arithmetic Operations Benefit Most}{5}{subsection.5.1}\protected@file@percent }
\@writefile{tdo}{\contentsline {todo}{Phase 2: Confirm with IPC and port utilisation counters.}{5}{section*.17}\protected@file@percent }
\pgfsyspdfmark {pgfid274}{13184317}{5758368}
\pgfsyspdfmark {pgfid275}{2015231}{-36522418}
\pgfsyspdfmark {pgfid276}{2785278}{-36715143}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.2}Why the Compiler Cannot Auto-Vectorise NTT}{5}{subsection.5.2}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.3}Why SHAKE Operations Benefit Less}{5}{subsection.5.3}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.4}Why Noise Sampling Barely Benefits}{5}{subsection.5.4}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.5}NTT Cache-State Variation Across Parameter Sets}{5}{subsection.5.5}\protected@file@percent }
\@writefile{tdo}{\contentsline {todo}{Phase 2: Verify with L1/L2 miss counters split by scalar vs AVX2.}{5}{section*.18}\protected@file@percent }
\pgfsyspdfmark {pgfid279}{25927376}{9612704}
\pgfsyspdfmark {pgfid282}{38210436}{9615654}
\pgfsyspdfmark {pgfid283}{38980483}{9422929}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.6}Implications for Deployment}{5}{subsection.5.6}\protected@file@percent }
\citation{kyber-avx2}
\citation{pqclean}
\citation{kyber2018}
\citation{fips203}
\citation{pqm4}
\citation{supercop}
\citation{pqm4}
\citation{gueron2014}
\citation{bernstein2006}
\citation{ntt-survey}
\citation{cachetime}
\citation{papi}
\bibstyle{ACM-Reference-Format}
\bibdata{refs}
\bibcite{bernstein2006}{{1}{2006}{{Bernstein}}{{}}}
\bibcite{supercop}{{2}{[n.\,d.]}{{Bernstein and Lange}}{{}}}
\bibcite{cachetime}{{3}{2008}{{Bernstein and Schwabe}}{{}}}
\bibcite{kyber2018}{{4}{2018}{{Bos et~al\mbox {.}}}{{}}}
\bibcite{rapl}{{5}{2010}{{David et~al\mbox {.}}}{{}}}
\bibcite{bettini2024}{{6}{2023}{{Google Security Blog}}{{}}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.7}Limitations}{6}{subsection.5.7}\protected@file@percent }
\@writefile{toc}{\contentsline {paragraph}{No hardware counter data (Phase\nonbreakingspace 1).}{6}{section*.19}\protected@file@percent }
\@writefile{tdo}{\contentsline {todo}{Phase 2: PAPI counters: IPC, cache miss rates.}{6}{section*.20}\protected@file@percent }
\pgfsyspdfmark {pgfid284}{16379392}{38731168}
\pgfsyspdfmark {pgfid285}{2015231}{38734118}
\pgfsyspdfmark {pgfid286}{2785278}{38541393}
\@writefile{toc}{\contentsline {paragraph}{Single microarchitecture.}{6}{section*.21}\protected@file@percent }
\@writefile{tdo}{\contentsline {todo}{Phase 3: Repeat on AMD Zen, ARM Graviton3, RISC-V.}{6}{section*.22}\protected@file@percent }
\pgfsyspdfmark {pgfid289}{6791818}{34708896}
\pgfsyspdfmark {pgfid290}{2015231}{32453210}
\pgfsyspdfmark {pgfid291}{2785278}{32260485}
\@writefile{toc}{\contentsline {paragraph}{Frequency scaling.}{6}{section*.23}\protected@file@percent }
\@writefile{tdo}{\contentsline {todo}{Phase 2: Characterize frequency during benchmarks; consider RAPL-normalized cycle counts.}{6}{section*.24}\protected@file@percent }
\pgfsyspdfmark {pgfid294}{6161296}{30686624}
\pgfsyspdfmark {pgfid295}{2015231}{24009614}
\pgfsyspdfmark {pgfid296}{2785278}{23816889}
\@writefile{toc}{\contentsline {section}{\numberline {6}Related Work}{6}{section.6}\protected@file@percent }
\newlabel{sec:related}{{6}{6}{Related Work}{section.6}{}}
\@writefile{toc}{\contentsline {paragraph}{ML-KEM / Kyber implementations.}{6}{section*.25}\protected@file@percent }
\@writefile{toc}{\contentsline {paragraph}{PQC benchmarking.}{6}{section*.26}\protected@file@percent }
\@writefile{toc}{\contentsline {paragraph}{SIMD in cryptography.}{6}{section*.27}\protected@file@percent }
\@writefile{toc}{\contentsline {paragraph}{NTT optimization.}{6}{section*.28}\protected@file@percent }
\@writefile{toc}{\contentsline {paragraph}{Hardware counter profiling.}{6}{section*.29}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {7}Conclusion}{6}{section.7}\protected@file@percent }
\newlabel{sec:conclusion}{{7}{6}{Conclusion}{section.7}{}}
\@writefile{toc}{\contentsline {paragraph}{Future work.}{6}{section*.30}\protected@file@percent }
\@writefile{toc}{\contentsline {paragraph}{Artifact.}{6}{section*.31}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{References}{6}{section*.33}\protected@file@percent }
\bibcite{gueron2014}{{7}{2013}{{Gueron and Krasnov}}{{}}}
\bibcite{papi}{{8}{[n.\,d.]}{{Innovative Computing Laboratory, University of Tennessee}}{{}}}
\bibcite{pqm4}{{9}{[n.\,d.]}{{Kannwischer et~al\mbox {.}}}{{}}}
\bibcite{ntt-survey}{{10}{2016}{{Longa and Naehrig}}{{}}}
\bibcite{fips204}{{11}{2024a}{{National Institute of Standards and Technology}}{{}}}
\bibcite{fips203}{{12}{2024b}{{National Institute of Standards and Technology}}{{}}}
\bibcite{fips205}{{13}{2024c}{{National Institute of Standards and Technology}}{{}}}
\bibcite{pqclean}{{14}{[n.\,d.]}{{PQClean Contributors}}{{}}}
\bibcite{kyber-avx2}{{15}{[n.\,d.]}{{Schwabe and Seiler}}{{}}}
\newlabel{tocindent-1}{0pt}
\newlabel{tocindent0}{0pt}
\newlabel{tocindent1}{6.25499pt}
\newlabel{tocindent2}{10.34999pt}
\newlabel{tocindent3}{0pt}
\newlabel{tocindent4}{0pt}
\newlabel{tocindent5}{0pt}
\@writefile{toc}{\contentsline {section}{\numberline {A}End-to-End KEM Speedup}{7}{appendix.A}\protected@file@percent }
\newlabel{sec:supp:kem}{{A}{7}{End-to-End KEM Speedup}{appendix.A}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {6}{\ignorespaces End-to-end KEM speedup (\texttt {ref}{} $\to $ \texttt {avx2}{}) for \texttt {kyber\_keypair}, \texttt {kyber\_encaps}, and \texttt {kyber\_decaps}. Intel Xeon Platinum 8268; 95\% bootstrap CI.}}{7}{figure.caption.34}\protected@file@percent }
\newlabel{fig:kemlevel}{{6}{7}{End-to-end KEM speedup (\varref {} $\to $ \varavx {}) for \op {kyber\_keypair}, \op {kyber\_encaps}, and \op {kyber\_decaps}. Intel Xeon Platinum 8268; 95\% bootstrap CI}{figure.caption.34}{}}
\@writefile{toc}{\contentsline {section}{\numberline {B}Full Operation Set}{7}{appendix.B}\protected@file@percent }
\newlabel{sec:supp:fullops}{{B}{7}{Full Operation Set}{appendix.B}{}}
\@writefile{tdo}{\contentsline {todo}{Full operation speedup table for all 20 benchmarked operations, including \texttt {poly\_compress}, \texttt {poly\_decompress}, \texttt {polyvec\_compress}, \texttt {poly\_tomsg}, and the \texttt {*\_derand} KEM variants.}{7}{section*.35}\protected@file@percent }
\pgfsyspdfmark {pgfid319}{28801187}{27830541}
\newlabel{TotPages}{{7}{7}{}{page.7}{}}
\gdef \@abspage@last{7}