# Build script for the ML-KEM (Kyber) benchmark executables.
#
# For every security level (512 / 768 / 1024) four binaries are produced:
#   bench_mlkem<LEVEL>_ref    reference C code, -O3
#   bench_mlkem<LEVEL>_refnv  reference C code, -O3 without tree vectorisation
#   bench_mlkem<LEVEL>_refo0  reference C code, -O0
#   bench_mlkem<LEVEL>_avx2   AVX2 C + assembly implementation, -O3
#
# Optional instrumentation is toggled with -DWITH_PAPI=ON / -DWITH_RAPL=ON.
cmake_minimum_required(VERSION 3.20)
project(pqc-bench LANGUAGES C ASM)

set(CMAKE_C_STANDARD 11)

# Default to Release, but only for single-config generators: multi-config
# generators (Visual Studio, Xcode, Ninja Multi-Config) ignore CMAKE_BUILD_TYPE.
get_property(pqc_bench_is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
if(NOT pqc_bench_is_multi_config AND NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Paths are resolved against this project's own root (PROJECT_SOURCE_DIR) so
# they stay valid when the project is pulled in through add_subdirectory().
set(KYBER_ROOT ${PROJECT_SOURCE_DIR}/../algorithms/kyber)

# ── Helpers shared across variants ──────────────────────────────────────────
# cpucycles / speed_print live in the kyber ref test dir; both variants use
# the same copies (avx2/test/ has identical files).
set(BENCH_HELPERS
  ${KYBER_ROOT}/ref/test/cpucycles.c
  ${KYBER_ROOT}/ref/test/speed_print.c
)

set(HARNESS_SRC ${PROJECT_SOURCE_DIR}/src/test_speed.c)

# ── ref sources (pure C, portable) ──────────────────────────────────────────
set(REF_DIR ${KYBER_ROOT}/ref)
set(REF_SOURCES
  ${REF_DIR}/kem.c
  ${REF_DIR}/indcpa.c
  ${REF_DIR}/polyvec.c
  ${REF_DIR}/poly.c
  ${REF_DIR}/ntt.c
  ${REF_DIR}/cbd.c
  ${REF_DIR}/reduce.c
  ${REF_DIR}/verify.c
  ${REF_DIR}/fips202.c
  ${REF_DIR}/symmetric-shake.c
  ${REF_DIR}/randombytes.c
)

# ── avx2 sources (C + x86 assembly) ─────────────────────────────────────────
set(AVX2_DIR ${KYBER_ROOT}/avx2)
set(AVX2_SOURCES
  ${AVX2_DIR}/kem.c
  ${AVX2_DIR}/indcpa.c
  ${AVX2_DIR}/polyvec.c
  ${AVX2_DIR}/poly.c
  ${AVX2_DIR}/cbd.c
  ${AVX2_DIR}/verify.c
  ${AVX2_DIR}/fips202.c
  ${AVX2_DIR}/fips202x4.c
  ${AVX2_DIR}/symmetric-shake.c
  ${AVX2_DIR}/randombytes.c
  ${AVX2_DIR}/consts.c
  ${AVX2_DIR}/rejsample.c
  ${AVX2_DIR}/fq.S
  ${AVX2_DIR}/shuffle.S
  ${AVX2_DIR}/ntt.S
  ${AVX2_DIR}/invntt.S
  ${AVX2_DIR}/basemul.S
  ${AVX2_DIR}/keccak4x/KeccakP-1600-times4-SIMD256.c
)

# ── KYBER_K mapping ──────────────────────────────────────────────────────────
# 512 → K=2, 768 → K=3, 1024 → K=4
set(KYBER_K_512 2)
set(KYBER_K_768 3)
set(KYBER_K_1024 4)

# ── Target helper ────────────────────────────────────────────────────────────
# pqc_bench_add_variant(<target> <kyber_k>
#                       SOURCES <file>...
#                       INCLUDE_DIRS <dir>...
#                       COMPILE_OPTIONS <flag>...)
#
# Creates one benchmark executable from HARNESS_SRC, the given implementation
# SOURCES and BENCH_HELPERS (in that order), defines KYBER_K=<kyber_k>, and
# applies the include directories and compile options PRIVATEly.
# The target name is appended to PQC_BENCH_TARGETS in the caller's scope so the
# optional PAPI / RAPL sections below can iterate over every benchmark target
# without repeating the level x variant matrix.
#
# Example:
#   pqc_bench_add_variant(bench_mlkem512_ref 2
#     SOURCES ${REF_SOURCES}
#     INCLUDE_DIRS ${REF_DIR} ${REF_DIR}/test
#     COMPILE_OPTIONS -O3 -fomit-frame-pointer)
function(pqc_bench_add_variant target kyber_k)
  cmake_parse_arguments(PARSE_ARGV 2 arg "" "" "SOURCES;INCLUDE_DIRS;COMPILE_OPTIONS")
  if(arg_UNPARSED_ARGUMENTS)
    message(FATAL_ERROR
      "pqc_bench_add_variant(${target}): unknown arguments: ${arg_UNPARSED_ARGUMENTS}")
  endif()
  if(NOT arg_SOURCES)
    message(FATAL_ERROR "pqc_bench_add_variant(${target}): SOURCES is required")
  endif()

  add_executable(${target}
    ${HARNESS_SRC}
    ${arg_SOURCES}
    ${BENCH_HELPERS}
  )
  target_include_directories(${target} PRIVATE ${arg_INCLUDE_DIRS})
  target_compile_definitions(${target} PRIVATE KYBER_K=${kyber_k})
  target_compile_options(${target} PRIVATE ${arg_COMPILE_OPTIONS})

  # Record the target for the instrumentation sections (explicit "return").
  list(APPEND PQC_BENCH_TARGETS ${target})
  set(PQC_BENCH_TARGETS "${PQC_BENCH_TARGETS}" PARENT_SCOPE)
endfunction()

# ── Build targets ────────────────────────────────────────────────────────────
set(PQC_BENCH_TARGETS "")

foreach(LEVEL 512 768 1024)
  set(K ${KYBER_K_${LEVEL}})

  # ref — optimised reference (O3, auto-vectorisation enabled)
  pqc_bench_add_variant(bench_mlkem${LEVEL}_ref ${K}
    SOURCES ${REF_SOURCES}
    INCLUDE_DIRS
      ${REF_DIR}
      ${REF_DIR}/test
    COMPILE_OPTIONS
      -O3
      -fomit-frame-pointer
  )

  # refnv — ref with auto-vectorisation disabled; isolates scalar O3 performance
  pqc_bench_add_variant(bench_mlkem${LEVEL}_refnv ${K}
    SOURCES ${REF_SOURCES}
    INCLUDE_DIRS
      ${REF_DIR}
      ${REF_DIR}/test
    COMPILE_OPTIONS
      -O3
      -fomit-frame-pointer
      -fno-tree-vectorize
  )

  # refo0 — ref at -O0; establishes unoptimised baseline
  pqc_bench_add_variant(bench_mlkem${LEVEL}_refo0 ${K}
    SOURCES ${REF_SOURCES}
    INCLUDE_DIRS
      ${REF_DIR}
      ${REF_DIR}/test
    COMPILE_OPTIONS
      -O0
  )

  # avx2 — hand-written AVX2 assembly + O3
  # NOTE: -march=native / -mtune=native tie the binary to the build host's CPU.
  pqc_bench_add_variant(bench_mlkem${LEVEL}_avx2 ${K}
    SOURCES ${AVX2_SOURCES}
    INCLUDE_DIRS
      ${AVX2_DIR}
      ${AVX2_DIR}/test
      ${AVX2_DIR}/keccak4x
    COMPILE_OPTIONS
      -O3
      -fomit-frame-pointer
      -mavx2
      -mbmi2
      -mpopcnt
      -march=native
      -mtune=native
  )
endforeach()

# ── PAPI (hardware performance counters) ─────────────────────────────────────
# When enabled, every benchmark target is compiled with WITH_PAPI defined and
# linked against libpapi; configuration fails if the library or header is missing.
option(WITH_PAPI "Link against PAPI for hardware counter collection" OFF)
if(WITH_PAPI)
  find_library(PAPI_LIB papi REQUIRED)
  find_path(PAPI_INCLUDE papi.h REQUIRED)
  foreach(bench_target IN LISTS PQC_BENCH_TARGETS)
    target_include_directories(${bench_target} PRIVATE "${PAPI_INCLUDE}")
    # PRIVATE: executables have no consumers, and the keyword form avoids the
    # legacy plain signature of target_link_libraries().
    target_link_libraries(${bench_target} PRIVATE "${PAPI_LIB}")
    target_compile_definitions(${bench_target} PRIVATE WITH_PAPI)
  endforeach()
endif()

# ── RAPL energy measurement ──────────────────────────────────────────────────
# Requires root or CAP_SYS_RAWIO on the benchmark node.
# When enabled, every benchmark target is compiled with WITH_RAPL defined.
option(WITH_RAPL "Enable Intel RAPL energy measurement" OFF)
if(WITH_RAPL)
  foreach(bench_target IN LISTS PQC_BENCH_TARGETS)
    target_compile_definitions(${bench_target} PRIVATE WITH_RAPL)
  endforeach()
endif()