LeVCS/crates/levcs-core/benches/gc_walk.rs

128 lines
4.6 KiB
Rust

//! GC reachability-walk throughput.
//!
//! The walk is currently inlined in the CLI's `gc` command (see
//! `levcs-cli/src/repo_cmds.rs` near the `gc` fn). For benchmarking we
//! reproduce the same loop here against a synthetic store.
//!
//! Setup builds N blobs all referenced by one tree, writes them to a
//! tempdir-backed store, then times the walk seeded from that tree's
//! ID (no commit object is written — see `populate` for why). Setup is
//! heavy (10K loose-object writes hits the disk hard); we let criterion
//! amortize it across samples by keeping the store across the bench's
//! iterations — the walk is read-only, so re-running it is harmless.
use std::collections::HashSet;
use std::path::PathBuf;
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
use levcs_core::hash::blake3_hash;
use levcs_core::object::ObjectType;
use levcs_core::{
Blob, Commit, EntryType, FileMode, ObjectId, ObjectStore, Release, Tree, TreeEntry,
};
/// Create a fresh, uniquely named directory under the system temp dir.
///
/// Uniqueness comes from combining `prefix`, the current wall-clock
/// time in nanoseconds since the epoch, and the process ID — good
/// enough for a benchmark harness, where a collision would only mean
/// reusing an existing directory.
fn tempdir(prefix: &str) -> PathBuf {
    // If the clock is somehow before the epoch, fall back to 0 rather
    // than panicking; the PID still disambiguates.
    let nanos = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map_or(0, |d| d.as_nanos());
    let dir = std::env::temp_dir().join(format!("{prefix}-{nanos}-{}", std::process::id()));
    std::fs::create_dir_all(&dir).unwrap();
    dir
}
/// Populate `store` with N unique blobs and a single tree referencing
/// them all. Returns the tree's ID — the seed for the walk.
///
/// We deliberately do NOT include a commit. Commit objects are
/// signed-frame types whose `RawObject::parse` requires a signature
/// trailer; producing one needs `levcs-identity`, which isn't a dep of
/// `levcs-core`. Since the walk's cost is dominated by the per-object
/// loop body (HashSet insert + read + parse), the shape Tree → N Blobs
/// captures everything we want to characterize.
/// Populate `store` with `n_blobs` unique blobs plus a single tree
/// whose entries reference every one of them. Returns the tree's ID —
/// the seed for the walk.
///
/// We deliberately do NOT include a commit. Commit objects are
/// signed-frame types whose `RawObject::parse` requires a signature
/// trailer; producing one needs `levcs-identity`, which isn't a dep of
/// `levcs-core`. Since the walk's cost is dominated by the per-object
/// loop body (HashSet insert + read + parse), the shape Tree → N Blobs
/// captures everything we want to characterize.
fn populate(store: &ObjectStore, n_blobs: usize) -> ObjectId {
    let mut tree = Tree::new();
    for idx in 0..n_blobs {
        // A unique body per blob guarantees a distinct object ID, so
        // the store really holds `n_blobs` separate loose objects.
        let blob_bytes = Blob::new(format!("blob-{idx:08}\n").into_bytes()).serialize();
        let blob_id = blake3_hash(&blob_bytes);
        store.write_at(blob_id, &blob_bytes).unwrap();
        tree.entries.push(TreeEntry {
            name: format!("file_{idx:08}.txt"),
            entry_type: EntryType::Blob,
            mode: FileMode::REGULAR,
            hash: blob_id,
        });
    }
    tree.sort_and_validate().unwrap();
    let tree_bytes = tree.serialize();
    let tree_id = blake3_hash(&tree_bytes);
    store.write_at(tree_id, &tree_bytes).unwrap();
    tree_id
}
/// The walk loop — reproduces the body of `gc` in the CLI. Returns the
/// number of reachable IDs found, just to keep the optimizer honest.
/// The walk loop — reproduces the body of `gc` in the CLI. Returns the
/// number of reachable IDs found, just to keep the optimizer honest.
fn walk_reachable(store: &ObjectStore, root: ObjectId) -> usize {
    let mut seen: HashSet<ObjectId> = HashSet::new();
    let mut pending: Vec<ObjectId> = vec![root];
    while let Some(id) = pending.pop() {
        // `insert` returning false means this ID was already walked.
        if !seen.insert(id) {
            continue;
        }
        // Unreadable or missing objects are skipped silently, matching
        // the CLI's best-effort behavior; parse failures likewise only
        // stop expansion of that one object.
        if let Ok(raw) = store.read_object(id) {
            match raw.object_type {
                ObjectType::Tree => {
                    if let Ok(tree) = Tree::parse_body(&raw.body) {
                        pending.extend(tree.entries.into_iter().map(|entry| entry.hash));
                    }
                }
                ObjectType::Commit => {
                    if let Ok(commit) = Commit::parse_body(&raw.body) {
                        pending.push(commit.tree);
                        pending.extend(commit.parents);
                    }
                }
                ObjectType::Release => {
                    if let Ok(release) = Release::parse_body(&raw.body) {
                        pending.push(release.tree);
                        pending.push(release.predecessor);
                    }
                }
                // Leaf object types: nothing further to enqueue.
                ObjectType::Blob | ObjectType::Authority => {}
            }
        }
    }
    seen.len()
}
/// Register the reachability-walk benchmark at two store sizes
/// (1K and 10K objects), building each store once per size and
/// re-running the read-only walk across criterion's samples.
fn bench_walk(c: &mut Criterion) {
    let mut group = c.benchmark_group("gc_reachability_walk");
    // Reduce sample count for the heavy 10K case — setup is already
    // expensive and we don't need tight statistical bounds on a baseline.
    group.sample_size(20);
    for &n_objects in [1000usize, 10_000].iter() {
        let dir = tempdir(&format!("levcs-bench-gc-{n_objects}"));
        let store = ObjectStore::new(dir.clone());
        store.ensure_dirs().unwrap();
        let root = populate(&store, n_objects);
        group.bench_with_input(
            BenchmarkId::from_parameter(format!("{n_objects}_objects")),
            &(),
            |b, _| b.iter(|| black_box(walk_reachable(&store, root))),
        );
        // Best-effort cleanup; a leftover tempdir is harmless.
        let _ = std::fs::remove_dir_all(dir);
    }
    group.finish();
}
// Criterion entry points: wire the benchmark group into the `main`
// function that `criterion_main!` generates for this bench binary.
criterion_group!(benches, bench_walk);
criterion_main!(benches);