where-simd-helps/harness/CMakeLists.txt

158 lines
5.2 KiB
CMake

cmake_minimum_required(VERSION 3.20)
project(pqc-bench C ASM)

# Build as C11 and fail at configure time if the compiler cannot provide it,
# instead of silently falling back to an older standard.
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED ON)

# Default to a Release build when the user did not pick a build type.
# CMAKE_BUILD_TYPE is only meaningful for single-config generators, so leave
# it alone for multi-config ones (Visual Studio, Xcode, Ninja Multi-Config).
# FORCE is needed to replace the empty cache entry; it never overrides a
# value the user supplied because of the NOT CMAKE_BUILD_TYPE guard.
get_property(pqc_bench_is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
if(NOT pqc_bench_is_multi_config AND NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type (defaulted to Release)" FORCE)
endif()

# Root of the Kyber implementations, located relative to this project's own
# directory. PROJECT_SOURCE_DIR (rather than CMAKE_SOURCE_DIR) keeps the path
# correct when this project is pulled in via add_subdirectory().
set(KYBER_ROOT "${PROJECT_SOURCE_DIR}/../algorithms/kyber")

# ── Helpers shared across variants ──────────────────────────────────────────
# cpucycles / speed_print live in the kyber ref test dir; both variants use
# the same copies (avx2/test/ has identical files).
set(BENCH_HELPERS
  "${KYBER_ROOT}/ref/test/cpucycles.c"
  "${KYBER_ROOT}/ref/test/speed_print.c"
)

# Benchmark driver compiled into every bench_mlkem* executable.
set(HARNESS_SRC "${PROJECT_SOURCE_DIR}/src/test_speed.c")
# ── ref sources (pure C, portable) ──────────────────────────────────────────
# REF_DIR is the reference implementation directory; REF_SOURCES ends up
# holding the full path of every listed file inside it.
set(REF_DIR "${KYBER_ROOT}/ref")

# Start from bare file names ...
set(REF_SOURCES
  kem.c
  indcpa.c
  polyvec.c
  poly.c
  ntt.c
  cbd.c
  reduce.c
  verify.c
  fips202.c
  symmetric-shake.c
  randombytes.c
)
# ... then anchor each one under REF_DIR (list order is preserved).
list(TRANSFORM REF_SOURCES PREPEND "${REF_DIR}/")
# ── avx2 sources (C + x86 assembly) ─────────────────────────────────────────
# AVX2_DIR is the AVX2 implementation directory; AVX2_SOURCES ends up holding
# the full path of every listed C and assembly (.S) file inside it.
set(AVX2_DIR "${KYBER_ROOT}/avx2")

# Paths below are relative to AVX2_DIR: C sources first, then the assembly
# files, then the keccak4x source from its subdirectory.
set(AVX2_SOURCES
  kem.c
  indcpa.c
  polyvec.c
  poly.c
  cbd.c
  verify.c
  fips202.c
  fips202x4.c
  symmetric-shake.c
  randombytes.c
  consts.c
  rejsample.c
  fq.S
  shuffle.S
  ntt.S
  invntt.S
  basemul.S
  keccak4x/KeccakP-1600-times4-SIMD256.c
)
# Turn every entry into a full path (list order is preserved).
list(TRANSFORM AVX2_SOURCES PREPEND "${AVX2_DIR}/")
# ── KYBER_K mapping ──────────────────────────────────────────────────────────
# Defines KYBER_K_512 = 2, KYBER_K_768 = 3 and KYBER_K_1024 = 4.
# For these three levels K is exactly the level divided by 256.
foreach(kyber_level IN ITEMS 512 768 1024)
  math(EXPR KYBER_K_${kyber_level} "${kyber_level} / 256")
endforeach()
# ── Build targets ────────────────────────────────────────────────────────────
# pqc_bench_add_variant(<target> <kyber_k> <impl_dir>
#                       SOURCES <file>...
#                       OPTIONS <flag>...
#                       [EXTRA_INCLUDES <dir>...])
#
# Adds one benchmark executable named <target>, built from HARNESS_SRC, the
# given implementation SOURCES and BENCH_HELPERS.
#   <kyber_k>       value passed to the compiler as the KYBER_K definition
#   <impl_dir>      implementation directory; it and its test/ subdirectory
#                   are added as private include directories
#   SOURCES         implementation source files
#   OPTIONS         private compile options for this variant
#   EXTRA_INCLUDES  additional private include directories, searched after
#                   <impl_dir> and <impl_dir>/test
function(pqc_bench_add_variant target kyber_k impl_dir)
  cmake_parse_arguments(PARSE_ARGV 3 arg "" "" "SOURCES;OPTIONS;EXTRA_INCLUDES")
  if(arg_UNPARSED_ARGUMENTS)
    message(FATAL_ERROR
      "pqc_bench_add_variant(${target}): unexpected arguments: ${arg_UNPARSED_ARGUMENTS}")
  endif()
  if(NOT arg_SOURCES)
    message(FATAL_ERROR "pqc_bench_add_variant(${target}): SOURCES is required")
  endif()

  add_executable(${target}
    ${HARNESS_SRC}
    ${arg_SOURCES}
    ${BENCH_HELPERS}
  )
  target_include_directories(${target} PRIVATE
    ${impl_dir}
    ${impl_dir}/test
    ${arg_EXTRA_INCLUDES}
  )
  target_compile_definitions(${target} PRIVATE KYBER_K=${kyber_k})
  target_compile_options(${target} PRIVATE ${arg_OPTIONS})
endfunction()

# One set of four executables per security level:
#   bench_mlkem<LEVEL>_{ref,refnv,refo0,avx2}
foreach(LEVEL 512 768 1024)
  set(K ${KYBER_K_${LEVEL}})

  # ref — optimised reference (O3, auto-vectorisation enabled)
  pqc_bench_add_variant(bench_mlkem${LEVEL}_ref ${K} ${REF_DIR}
    SOURCES ${REF_SOURCES}
    OPTIONS -O3 -fomit-frame-pointer
  )

  # refnv — ref with auto-vectorisation disabled; isolates scalar O3 performance.
  # Clang treats -fno-tree-vectorize as an alias for -fno-vectorize, which only
  # turns off its loop vectoriser, so the SLP vectoriser is disabled explicitly
  # there. GCC builds receive exactly the same flags as before.
  pqc_bench_add_variant(bench_mlkem${LEVEL}_refnv ${K} ${REF_DIR}
    SOURCES ${REF_SOURCES}
    OPTIONS
      -O3 -fomit-frame-pointer -fno-tree-vectorize
      "$<$<C_COMPILER_ID:Clang,AppleClang>:-fno-slp-vectorize>"
  )

  # refo0 — ref at -O0; establishes unoptimised baseline
  pqc_bench_add_variant(bench_mlkem${LEVEL}_refo0 ${K} ${REF_DIR}
    SOURCES ${REF_SOURCES}
    OPTIONS -O0
  )

  # avx2 — hand-written AVX2 assembly + O3
  pqc_bench_add_variant(bench_mlkem${LEVEL}_avx2 ${K} ${AVX2_DIR}
    SOURCES ${AVX2_SOURCES}
    EXTRA_INCLUDES ${AVX2_DIR}/keccak4x
    OPTIONS
      -O3 -fomit-frame-pointer -mavx2 -mbmi2 -mpopcnt -march=native -mtune=native
  )
endforeach()
# ── PAPI (hardware performance counters) ─────────────────────────────────────
# When WITH_PAPI is ON, locate the PAPI library and header (configuration
# fails if either is missing) and, for every bench_mlkem<LEVEL>_<VARIANT>
# executable, add the PAPI include directory, link the library and define
# WITH_PAPI for its sources.
option(WITH_PAPI "Link against PAPI for hardware counter collection" OFF)
if(WITH_PAPI)
  find_library(PAPI_LIB papi REQUIRED)
  find_path(PAPI_INCLUDE papi.h REQUIRED)
  foreach(LEVEL 512 768 1024)
    foreach(VARIANT ref refnv refo0 avx2)
      set(T bench_mlkem${LEVEL}_${VARIANT})
      target_include_directories(${T} PRIVATE "${PAPI_INCLUDE}")
      # Use the keyword signature: the plain (keyword-less) form has legacy
      # semantics and cannot be mixed with keyword calls on the same target.
      target_link_libraries(${T} PRIVATE "${PAPI_LIB}")
      target_compile_definitions(${T} PRIVATE WITH_PAPI)
    endforeach()
  endforeach()
endif()
# ── RAPL energy measurement ──────────────────────────────────────────────────
# Requires root or CAP_SYS_RAWIO on the benchmark node.
# Turning WITH_RAPL on adds the WITH_RAPL compile definition to every
# bench_mlkem<level>_<variant> executable; nothing else changes.
option(WITH_RAPL "Enable Intel RAPL energy measurement" OFF)
if(WITH_RAPL)
  foreach(rapl_variant IN ITEMS ref refnv refo0 avx2)
    foreach(rapl_level IN ITEMS 512 768 1024)
      target_compile_definitions(
        bench_mlkem${rapl_level}_${rapl_variant}
        PRIVATE WITH_RAPL
      )
    endforeach()
  endforeach()
endif()