specification-dilemma/similarity.py

64 lines
1.8 KiB
Python

"""Compute pairwise cosine similarities within each condition."""
from __future__ import annotations
from itertools import combinations
from pathlib import Path
import numpy as np
import pandas as pd
import yaml
def load_config(path: str = "config.yaml") -> dict:
    """Load the YAML configuration file at ``path``.

    Args:
        path: Location of the YAML file. Defaults to ``config.yaml``,
            resolved against the current working directory.

    Returns:
        The parsed top-level YAML value (expected to be a mapping), or an
        empty dict when the file contains no YAML document.
    """
    # Pin the encoding so parsing does not depend on the platform locale.
    with open(path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
    # safe_load yields None for an empty document; honour the declared
    # return type so callers get a KeyError on a missing key, not a TypeError.
    return {} if config is None else config
def pairwise_cosine(embeddings: np.ndarray) -> tuple[np.ndarray, list[tuple[int, int]]]:
    """Return (similarities, index_pairs) for all i<j pairs.

    Assumes embeddings are L2-normalized, so cosine = dot product; no
    normalization is performed here.

    Args:
        embeddings: 2-D array with one embedding per row.

    Returns:
        A tuple ``(sims, pairs)``. ``pairs`` lists every ``(i, j)`` with
        ``i < j`` in lexicographic order, and ``sims[k]`` is the dot product
        of rows ``pairs[k]`` as float64. Both are empty when there are fewer
        than two rows.

    Raises:
        ValueError: If there are at least two rows but ``embeddings`` is not
            2-D.
    """
    n = embeddings.shape[0]
    pairs = list(combinations(range(n), 2))
    if not pairs:
        # Fewer than two rows: nothing to compare.
        return np.empty(0, dtype=np.float64), pairs
    if embeddings.ndim != 2:
        raise ValueError(
            f"embeddings must be 2-D (n_items, dim); got shape {embeddings.shape}"
        )

    # One matrix product yields every dot product at once; gather the strict
    # upper triangle in the same order as `pairs` instead of looping in Python.
    gram = embeddings @ embeddings.T
    idx = np.array(pairs, dtype=np.intp)
    sims = gram[idx[:, 0], idx[:, 1]].astype(np.float64)
    return sims, pairs
def main() -> None:
    """Compute within-condition pairwise similarities and save them as CSV.

    Reads ``paths.embeddings_dir`` and ``paths.results_dir`` from the config,
    loads ``<condition>.npy`` for the "sparse" and "dense" conditions, prints
    summary statistics per condition, and writes one row per pair to
    ``pairwise.csv`` in the results directory. A condition whose embedding
    file is missing is skipped with a message.
    """
    cfg = load_config()
    emb_root = Path(cfg["paths"]["embeddings_dir"])
    results_root = Path(cfg["paths"]["results_dir"])
    results_root.mkdir(parents=True, exist_ok=True)

    columns = ["condition", "i", "j", "cosine"]
    rows = []
    for condition in ("sparse", "dense"):
        emb_path = emb_root / f"{condition}.npy"
        if not emb_path.exists():
            print(f"Missing embeddings for {condition}; skipping.")
            continue

        embeddings = np.load(emb_path)
        sims, pairs = pairwise_cosine(embeddings)
        rows.extend(
            {"condition": condition, "i": i, "j": j, "cosine": s}
            for (i, j), s in zip(pairs, sims)
        )

        # The mean needs at least one pair and the sample std (ddof=1) at
        # least two; report NaN explicitly rather than letting NumPy emit
        # RuntimeWarnings on degenerate input.
        mean = sims.mean() if len(sims) > 0 else float("nan")
        std = sims.std(ddof=1) if len(sims) > 1 else float("nan")
        print(
            f"{condition}: n_outputs={embeddings.shape[0]}, "
            f"n_pairs={len(sims)}, mean={mean:.4f}, "
            f"std={std:.4f}"
        )

    # Passing the column list keeps the CSV header present even when no
    # condition produced any rows.
    out_path = results_root / "pairwise.csv"
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(out_path, index=False)
    print(f"Saved {out_path}")
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()