174 lines
16 KiB
TeX
174 lines
16 KiB
TeX
\relax
|
||
\providecommand\hyper@newdestlabel[2]{}
|
||
\providecommand\HyField@AuxAddToFields[1]{}
|
||
\providecommand\HyField@AuxAddToCoFields[2]{}
|
||
\citation{fips203,fips204,fips205}
|
||
\citation{bettini2024}
|
||
\citation{kyber-avx2}
|
||
\citation{fips203}
|
||
\citation{ntt-survey}
|
||
\@writefile{toc}{\contentsline {section}{Abstract}{1}{section*.1}\protected@file@percent }
|
||
\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}\protected@file@percent }
|
||
\newlabel{sec:intro}{{1}{1}{Introduction}{section.1}{}}
|
||
\@writefile{toc}{\contentsline {section}{\numberline {2}Background}{1}{section.2}\protected@file@percent }
|
||
\newlabel{sec:background}{{2}{1}{Background}{section.2}{}}
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}ML-KEM and the Number Theoretic Transform}{1}{subsection.2.1}\protected@file@percent }
|
||
\citation{kyber-avx2}
|
||
\citation{papi}
|
||
\citation{rapl}
|
||
\citation{kyber-avx2}
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}AVX2 SIMD on x86-64}{2}{subsection.2.2}\protected@file@percent }
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Compilation Variants}{2}{subsection.2.3}\protected@file@percent }
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {2.4}Hardware Performance Counters and Energy}{2}{subsection.2.4}\protected@file@percent }
|
||
\newlabel{sec:bg:papi}{{2.4}{2}{Hardware Performance Counters and Energy}{subsection.2.4}{}}
|
||
\@writefile{tdo}{\contentsline {todo}{Phase 2: Expand with PAPI and RAPL background once data is collected.}{2}{section*.6}\protected@file@percent }
|
||
\pgfsyspdfmark {pgfid1}{20915651}{45096352}
|
||
\pgfsyspdfmark {pgfid4}{38210436}{45099302}
|
||
\pgfsyspdfmark {pgfid5}{38980483}{44906577}
|
||
\@writefile{toc}{\contentsline {section}{\numberline {3}Methodology}{2}{section.3}\protected@file@percent }
|
||
\newlabel{sec:methodology}{{3}{2}{Methodology}{section.3}{}}
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Implementation Source}{2}{subsection.3.1}\protected@file@percent }
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Compilation Variants}{2}{subsection.3.2}\protected@file@percent }
|
||
\newlabel{sec:meth:variants}{{3.2}{2}{Compilation Variants}{subsection.3.2}{}}
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Benchmark Harness}{2}{subsection.3.3}\protected@file@percent }
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {3.4}Hardware Platform}{2}{subsection.3.4}\protected@file@percent }
|
||
\@writefile{tdo}{\contentsline {todo}{Phase 2: Hardware counter collection via PAPI.}{3}{section*.7}\protected@file@percent }
|
||
\pgfsyspdfmark {pgfid6}{12703613}{37681124}
|
||
\pgfsyspdfmark {pgfid7}{2015231}{37684074}
|
||
\pgfsyspdfmark {pgfid8}{2785278}{37491349}
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {3.5}Statistical Methodology}{3}{subsection.3.5}\protected@file@percent }
|
||
\newlabel{sec:meth:stats}{{3.5}{3}{Statistical Methodology}{subsection.3.5}{}}
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {3.6}Energy Measurement}{3}{subsection.3.6}\protected@file@percent }
|
||
\newlabel{sec:meth:energy}{{3.6}{3}{Energy Measurement}{subsection.3.6}{}}
|
||
\@writefile{tdo}{\contentsline {todo}{Phase 2: Intel RAPL (pkg + DRAM domains), EDP computation, per-operation joules.}{3}{section*.8}\protected@file@percent }
|
||
\pgfsyspdfmark {pgfid11}{3538944}{24335452}
|
||
\pgfsyspdfmark {pgfid12}{2015231}{24338402}
|
||
\pgfsyspdfmark {pgfid13}{2785278}{24145677}
|
||
\@writefile{toc}{\contentsline {section}{\numberline {4}Results}{3}{section.4}\protected@file@percent }
|
||
\newlabel{sec:results}{{4}{3}{Results}{section.4}{}}
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Cycle Count Distributions}{3}{subsection.4.1}\protected@file@percent }
|
||
\newlabel{sec:results:distributions}{{4.1}{3}{Cycle Count Distributions}{subsection.4.1}{}}
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {4.2}Speedup Decomposition}{3}{subsection.4.2}\protected@file@percent }
|
||
\newlabel{sec:results:decomp}{{4.2}{3}{Speedup Decomposition}{subsection.4.2}{}}
|
||
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Cycle count distributions for three representative ML-KEM-512 operations. Log $x$-axis. Dashed lines mark medians. Right-skew and outlier structure motivate nonparametric statistics.}}{3}{figure.caption.9}\protected@file@percent }
|
||
\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
|
||
\newlabel{fig:distributions}{{1}{3}{Cycle count distributions for three representative \mlkemk {512} operations. Log $x$-axis. Dashed lines mark medians. Right-skew and outlier structure motivate nonparametric statistics}{figure.caption.9}{}}
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {4.3}Hand-Written SIMD Speedup}{3}{subsection.4.3}\protected@file@percent }
|
||
\newlabel{sec:results:simd}{{4.3}{3}{Hand-Written SIMD Speedup}{subsection.4.3}{}}
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {4.4}Statistical Significance}{3}{subsection.4.4}\protected@file@percent }
|
||
\newlabel{sec:results:stats}{{4.4}{3}{Statistical Significance}{subsection.4.4}{}}
|
||
\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Cumulative speedup at each optimization stage, normalized to \texttt {refo0}{} (1×). Three bars per operation: \textcolor {colRefnv}{$\blacksquare $}\nonbreakingspace O3 no auto-vec, \textcolor {colRef}{$\blacksquare $}\nonbreakingspace O3 + auto-vec, \textcolor {colAvx}{$\blacksquare $}\nonbreakingspace O3 + hand SIMD (AVX2). Log $y$-axis; 95\% bootstrap CI shown on \texttt {avx2}{} bars. Sorted by \texttt {avx2}{} speedup.}}{4}{figure.caption.10}\protected@file@percent }
|
||
\newlabel{fig:decomp}{{2}{4}{Cumulative speedup at each optimization stage, normalized to \varrefo {} (1×). Three bars per operation: \textcolor {colRefnv}{$\blacksquare $}~O3 no auto-vec, \textcolor {colRef}{$\blacksquare $}~O3 + auto-vec, \textcolor {colAvx}{$\blacksquare $}~O3 + hand SIMD (AVX2). Log $y$-axis; 95\% bootstrap CI shown on \varavx {} bars. Sorted by \varavx {} speedup}{figure.caption.10}{}}
|
||
\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Hand-written SIMD speedup (\texttt {ref}{} $\to $ \texttt {avx2}{}) per operation, across all three ML-KEM{} parameter sets. Log $y$-axis. 95\% bootstrap CI error bars (often sub-pixel). Sorted by ML-KEM-512 speedup.}}{4}{figure.caption.11}\protected@file@percent }
|
||
\newlabel{fig:handsimd}{{3}{4}{Hand-written SIMD speedup (\varref {} $\to $ \varavx {}) per operation, across all three \mlkem {} parameter sets. Log $y$-axis. 95\% bootstrap CI error bars (often sub-pixel). Sorted by \mlkemk {512} speedup}{figure.caption.11}{}}
|
||
\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Hand-written SIMD speedup (\texttt {ref}{} $\to $ \texttt {avx2}{}), median ratio with 95\% bootstrap CI. All Cliff's $\delta = +1.000$, $p < 10^{-300}$.}}{4}{table.caption.12}\protected@file@percent }
|
||
\newlabel{tab:simd}{{1}{4}{Hand-written SIMD speedup (\varref {} $\to $ \varavx {}), median ratio with 95\% bootstrap CI. All Cliff's $\delta = +1.000$, $p < 10^{-300}$}{table.caption.12}{}}
|
||
\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Cliff's $\delta $ (\texttt {ref}{} vs.\ \texttt {avx2}{}) for all operations and parameter sets. $\delta = +1$: AVX2 is faster in every observation pair. Nearly all cells are at $+1.000$.}}{4}{figure.caption.13}\protected@file@percent }
|
||
\newlabel{fig:cliffs}{{4}{4}{Cliff's $\delta $ (\varref {} vs.\ \varavx {}) for all operations and parameter sets. $\delta = +1$: AVX2 is faster in every observation pair. Nearly all cells are at $+1.000$}{figure.caption.13}{}}
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {4.5}Cross-Parameter Consistency}{4}{subsection.4.5}\protected@file@percent }
|
||
\newlabel{sec:results:crossparams}{{4.5}{4}{Cross-Parameter Consistency}{subsection.4.5}{}}
|
||
\@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces Per-polynomial operation speedup (\texttt {ref}{} $\to $ \texttt {avx2}{}) across security parameters. Polynomial dimension is 256 for all; variation reflects cache-state differences in the calling context.}}{5}{figure.caption.14}\protected@file@percent }
|
||
\newlabel{fig:crossparams}{{5}{5}{Per-polynomial operation speedup (\varref {} $\to $ \varavx {}) across security parameters. Polynomial dimension is 256 for all; variation reflects cache-state differences in the calling context}{figure.caption.14}{}}
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {4.6}Hardware Counter Breakdown}{5}{subsection.4.6}\protected@file@percent }
|
||
\newlabel{sec:results:papi}{{4.6}{5}{Hardware Counter Breakdown}{subsection.4.6}{}}
|
||
\@writefile{tdo}{\contentsline {todo}{Phase 2: IPC, L1/L2/L3 cache miss rates, branch mispredictions via PAPI. This section will contain bar charts of per-counter values comparing ref and avx2 for each operation, explaining the mechanistic origins of the speedup.}{5}{section*.15}\protected@file@percent }
|
||
\pgfsyspdfmark {pgfid264}{3538944}{21389118}
|
||
\pgfsyspdfmark {pgfid265}{2015231}{21392068}
|
||
\pgfsyspdfmark {pgfid266}{2785278}{21199343}
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {4.7}Energy Efficiency}{5}{subsection.4.7}\protected@file@percent }
|
||
\newlabel{sec:results:energy}{{4.7}{5}{Energy Efficiency}{subsection.4.7}{}}
|
||
\@writefile{tdo}{\contentsline {todo}{Phase 2: Intel RAPL pkg + DRAM energy readings per operation. EDP (energy-delay product) comparison. Energy per KEM operation.}{5}{section*.16}\protected@file@percent }
|
||
\pgfsyspdfmark {pgfid269}{3538944}{19496559}
|
||
\pgfsyspdfmark {pgfid270}{2015231}{-14840343}
|
||
\pgfsyspdfmark {pgfid271}{2785278}{-15033068}
|
||
\@writefile{toc}{\contentsline {section}{\numberline {5}Discussion}{5}{section.5}\protected@file@percent }
|
||
\newlabel{sec:discussion}{{5}{5}{Discussion}{section.5}{}}
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {5.1}Why Arithmetic Operations Benefit Most}{5}{subsection.5.1}\protected@file@percent }
|
||
\@writefile{tdo}{\contentsline {todo}{Phase 2: Confirm with IPC and port utilisation counters.}{5}{section*.17}\protected@file@percent }
|
||
\pgfsyspdfmark {pgfid274}{13184317}{5758368}
|
||
\pgfsyspdfmark {pgfid275}{2015231}{-36522418}
|
||
\pgfsyspdfmark {pgfid276}{2785278}{-36715143}
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {5.2}Why the Compiler Cannot Auto-Vectorise NTT}{5}{subsection.5.2}\protected@file@percent }
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {5.3}Why SHAKE Operations Benefit Less}{5}{subsection.5.3}\protected@file@percent }
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {5.4}Why Noise Sampling Barely Benefits}{5}{subsection.5.4}\protected@file@percent }
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {5.5}NTT Cache-State Variation Across Parameter Sets}{5}{subsection.5.5}\protected@file@percent }
|
||
\@writefile{tdo}{\contentsline {todo}{Phase 2: Verify with L1/L2 miss counters split by scalar vs AVX2.}{5}{section*.18}\protected@file@percent }
|
||
\pgfsyspdfmark {pgfid279}{25927376}{9612704}
|
||
\pgfsyspdfmark {pgfid282}{38210436}{9615654}
|
||
\pgfsyspdfmark {pgfid283}{38980483}{9422929}
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {5.6}Implications for Deployment}{5}{subsection.5.6}\protected@file@percent }
|
||
\citation{kyber-avx2}
|
||
\citation{pqclean}
|
||
\citation{kyber2018}
|
||
\citation{fips203}
|
||
\citation{pqm4}
|
||
\citation{supercop}
|
||
\citation{pqm4}
|
||
\citation{gueron2014}
|
||
\citation{bernstein2006}
|
||
\citation{ntt-survey}
|
||
\citation{cachetime}
|
||
\citation{papi}
|
||
\bibstyle{ACM-Reference-Format}
|
||
\bibdata{refs}
|
||
\bibcite{bernstein2006}{{1}{2006}{{Bernstein}}{{}}}
|
||
\bibcite{supercop}{{2}{[n.\,d.]}{{Bernstein and Lange}}{{}}}
|
||
\bibcite{cachetime}{{3}{2008}{{Bernstein and Schwabe}}{{}}}
|
||
\bibcite{kyber2018}{{4}{2018}{{Bos et~al\mbox {.}}}{{}}}
|
||
\bibcite{rapl}{{5}{2010}{{David et~al\mbox {.}}}{{}}}
|
||
\bibcite{bettini2024}{{6}{2023}{{Google Security Blog}}{{}}}
|
||
\@writefile{toc}{\contentsline {subsection}{\numberline {5.7}Limitations}{6}{subsection.5.7}\protected@file@percent }
|
||
\@writefile{toc}{\contentsline {paragraph}{No hardware counter data (Phase\nonbreakingspace 1).}{6}{section*.19}\protected@file@percent }
|
||
\@writefile{tdo}{\contentsline {todo}{Phase 2: PAPI counters: IPC, cache miss rates.}{6}{section*.20}\protected@file@percent }
|
||
\pgfsyspdfmark {pgfid284}{16379392}{38731168}
|
||
\pgfsyspdfmark {pgfid285}{2015231}{38734118}
|
||
\pgfsyspdfmark {pgfid286}{2785278}{38541393}
|
||
\@writefile{toc}{\contentsline {paragraph}{Single microarchitecture.}{6}{section*.21}\protected@file@percent }
|
||
\@writefile{tdo}{\contentsline {todo}{Phase 3: Repeat on AMD Zen, ARM Graviton3, RISC-V.}{6}{section*.22}\protected@file@percent }
|
||
\pgfsyspdfmark {pgfid289}{6791818}{34708896}
|
||
\pgfsyspdfmark {pgfid290}{2015231}{32453210}
|
||
\pgfsyspdfmark {pgfid291}{2785278}{32260485}
|
||
\@writefile{toc}{\contentsline {paragraph}{Frequency scaling.}{6}{section*.23}\protected@file@percent }
|
||
\@writefile{tdo}{\contentsline {todo}{Phase 2: Characterize frequency during benchmarks; consider RAPL-normalized cycle counts.}{6}{section*.24}\protected@file@percent }
|
||
\pgfsyspdfmark {pgfid294}{6161296}{30686624}
|
||
\pgfsyspdfmark {pgfid295}{2015231}{24009614}
|
||
\pgfsyspdfmark {pgfid296}{2785278}{23816889}
|
||
\@writefile{toc}{\contentsline {section}{\numberline {6}Related Work}{6}{section.6}\protected@file@percent }
|
||
\newlabel{sec:related}{{6}{6}{Related Work}{section.6}{}}
|
||
\@writefile{toc}{\contentsline {paragraph}{ML-KEM / Kyber implementations.}{6}{section*.25}\protected@file@percent }
|
||
\@writefile{toc}{\contentsline {paragraph}{PQC benchmarking.}{6}{section*.26}\protected@file@percent }
|
||
\@writefile{toc}{\contentsline {paragraph}{SIMD in cryptography.}{6}{section*.27}\protected@file@percent }
|
||
\@writefile{toc}{\contentsline {paragraph}{NTT optimization.}{6}{section*.28}\protected@file@percent }
|
||
\@writefile{toc}{\contentsline {paragraph}{Hardware counter profiling.}{6}{section*.29}\protected@file@percent }
|
||
\@writefile{toc}{\contentsline {section}{\numberline {7}Conclusion}{6}{section.7}\protected@file@percent }
|
||
\newlabel{sec:conclusion}{{7}{6}{Conclusion}{section.7}{}}
|
||
\@writefile{toc}{\contentsline {paragraph}{Future work.}{6}{section*.30}\protected@file@percent }
|
||
\@writefile{toc}{\contentsline {paragraph}{Artifact.}{6}{section*.31}\protected@file@percent }
|
||
\@writefile{toc}{\contentsline {section}{References}{6}{section*.33}\protected@file@percent }
|
||
\bibcite{gueron2014}{{7}{2013}{{Gueron and Krasnov}}{{}}}
|
||
\bibcite{papi}{{8}{[n.\,d.]}{{Innovative Computing Laboratory, University of Tennessee}}{{}}}
|
||
\bibcite{pqm4}{{9}{[n.\,d.]}{{Kannwischer et~al\mbox {.}}}{{}}}
|
||
\bibcite{ntt-survey}{{10}{2016}{{Longa and Naehrig}}{{}}}
|
||
\bibcite{fips204}{{11}{2024a}{{National Institute of Standards and Technology}}{{}}}
|
||
\bibcite{fips203}{{12}{2024b}{{National Institute of Standards and Technology}}{{}}}
|
||
\bibcite{fips205}{{13}{2024c}{{National Institute of Standards and Technology}}{{}}}
|
||
\bibcite{pqclean}{{14}{[n.\,d.]}{{PQClean Contributors}}{{}}}
|
||
\bibcite{kyber-avx2}{{15}{[n.\,d.]}{{Schwabe and Seiler}}{{}}}
|
||
\newlabel{tocindent-1}{0pt}
|
||
\newlabel{tocindent0}{0pt}
|
||
\newlabel{tocindent1}{6.25499pt}
|
||
\newlabel{tocindent2}{10.34999pt}
|
||
\newlabel{tocindent3}{0pt}
|
||
\newlabel{tocindent4}{0pt}
|
||
\newlabel{tocindent5}{0pt}
|
||
\@writefile{toc}{\contentsline {section}{\numberline {A}End-to-End KEM Speedup}{7}{appendix.A}\protected@file@percent }
|
||
\newlabel{sec:supp:kem}{{A}{7}{End-to-End KEM Speedup}{appendix.A}{}}
|
||
\@writefile{lof}{\contentsline {figure}{\numberline {6}{\ignorespaces End-to-end KEM speedup (\texttt {ref}{} $\to $ \texttt {avx2}{}) for \texttt {kyber\_keypair}, \texttt {kyber\_encaps}, and \texttt {kyber\_decaps}. Intel Xeon Platinum 8268; 95\% bootstrap CI.}}{7}{figure.caption.34}\protected@file@percent }
|
||
\newlabel{fig:kemlevel}{{6}{7}{End-to-end KEM speedup (\varref {} $\to $ \varavx {}) for \op {kyber\_keypair}, \op {kyber\_encaps}, and \op {kyber\_decaps}. Intel Xeon Platinum 8268; 95\% bootstrap CI}{figure.caption.34}{}}
|
||
\@writefile{toc}{\contentsline {section}{\numberline {B}Full Operation Set}{7}{appendix.B}\protected@file@percent }
|
||
\newlabel{sec:supp:fullops}{{B}{7}{Full Operation Set}{appendix.B}{}}
|
||
\@writefile{tdo}{\contentsline {todo}{Full operation speedup table for all 20 benchmarked operations, including \texttt {poly\_compress}, \texttt {poly\_decompress}, \texttt {polyvec\_compress}, \texttt {poly\_tomsg}, and the \texttt {*\_derand} KEM variants.}{7}{section*.35}\protected@file@percent }
|
||
\pgfsyspdfmark {pgfid319}{28801187}{27830541}
|
||
\newlabel{TotPages}{{7}{7}{}{page.7}{}}
|
||
\gdef \@abspage@last{7}
|