"""Statistical tests and summary metrics for the similarity comparison.

Reports:

- Per-condition descriptive statistics
- Naive two-sample t-test on pairwise similarities
- Mann-Whitney U (nonparametric check)
- Cohen's d effect size
- Output-level bootstrap 95% CI for the difference in mean similarity
  (corrects for pairwise dependence)
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from itertools import combinations
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
import yaml
|
|
from scipy import stats
|
|
|
|
|
|
def load_config(path: str = "config.yaml") -> dict:
    """Read the YAML configuration file at *path* and return it as a dict."""
    with open(path, "r") as cfg_file:
        parsed = yaml.safe_load(cfg_file)
    return parsed
|
|
|
|
|
|
def cohens_d(a: np.ndarray, b: np.ndarray) -> float:
    """Cohen's d effect size for two independent samples.

    Uses the pooled standard deviation with sample (ddof=1) variances.
    """
    n1, n2 = len(a), len(b)
    var1 = a.var(ddof=1)
    var2 = b.var(ddof=1)
    # Weighted average of the two sample variances, then square-root.
    pooled_var = ((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2)
    mean_diff = a.mean() - b.mean()
    return mean_diff / np.sqrt(pooled_var)
|
|
|
|
|
|
def mean_pairwise(embeddings: np.ndarray) -> float:
    """Mean pairwise cosine for an L2-normalized embedding matrix.

    Vectorized: since rows are assumed unit-norm, the Gram matrix
    ``E @ E.T`` holds every pairwise cosine; averaging its strict upper
    triangle covers each unordered pair exactly once. This replaces the
    previous O(n^2) Python-level pair loop — significant because this
    function runs inside every bootstrap iteration.
    """
    n = embeddings.shape[0]
    gram = embeddings @ embeddings.T
    # k=1 excludes the diagonal (self-similarity) and the mirrored lower half.
    iu = np.triu_indices(n, k=1)
    return float(gram[iu].mean())
|
|
|
|
|
|
def bootstrap_diff(
    sparse_emb: np.ndarray,
    dense_emb: np.ndarray,
    n_iter: int,
    rng: np.random.Generator,
) -> tuple[float, float, np.ndarray]:
    """Output-level bootstrap of (mean_sparse - mean_dense).

    Resamples outputs (not pairs) with replacement, recomputes the mean
    pairwise similarity within each condition, and returns the 95% CI
    bounds plus the full array of bootstrap differences.
    """
    size_s = sparse_emb.shape[0]
    size_d = dense_emb.shape[0]
    diffs = np.empty(n_iter, dtype=float)
    for it in range(n_iter):
        # Draw sparse indices first, then dense, to keep the RNG stream stable.
        resampled_s = sparse_emb[rng.integers(0, size_s, size=size_s)]
        resampled_d = dense_emb[rng.integers(0, size_d, size=size_d)]
        diffs[it] = mean_pairwise(resampled_s) - mean_pairwise(resampled_d)
    bounds = np.percentile(diffs, [2.5, 97.5])
    return float(bounds[0]), float(bounds[1]), diffs
|
|
|
|
|
|
def main() -> None:
    """Load embeddings and pairwise similarities, run every test, emit stats.json."""
    cfg = load_config()
    embeddings_dir = Path(cfg["paths"]["embeddings_dir"])
    results_dir = Path(cfg["paths"]["results_dir"])
    results_dir.mkdir(parents=True, exist_ok=True)

    sparse_emb = np.load(embeddings_dir / "sparse.npy")
    dense_emb = np.load(embeddings_dir / "dense.npy")

    pairwise_df = pd.read_csv(results_dir / "pairwise.csv")
    sparse_sims = pairwise_df.loc[pairwise_df["condition"] == "sparse", "cosine"].to_numpy()
    dense_sims = pairwise_df.loc[pairwise_df["condition"] == "dense", "cosine"].to_numpy()

    def _describe(emb: np.ndarray, pair_sims: np.ndarray) -> dict:
        # Descriptive summary for one condition.
        return {
            "n_outputs": int(emb.shape[0]),
            "n_pairs": int(len(pair_sims)),
            "mean": float(pair_sims.mean()),
            "std": float(pair_sims.std(ddof=1)),
            "median": float(np.median(pair_sims)),
        }

    desc = {
        "sparse": _describe(sparse_emb, sparse_sims),
        "dense": _describe(dense_emb, dense_sims),
    }

    # Naive Welch t-test (pairwise dependence makes this optimistic).
    t_stat, t_p = stats.ttest_ind(sparse_sims, dense_sims, equal_var=False)

    # Nonparametric sanity check.
    u_stat, u_p = stats.mannwhitneyu(
        sparse_sims, dense_sims, alternative="two-sided"
    )

    effect = cohens_d(sparse_sims, dense_sims)

    # Output-level bootstrap (the dependence-aware test).
    rng = np.random.default_rng(cfg["analysis"]["random_seed"])
    n_boot = cfg["analysis"]["bootstrap_iterations"]
    ci_low, ci_high, _ = bootstrap_diff(
        sparse_emb,
        dense_emb,
        n_iter=n_boot,
        rng=rng,
    )

    summary = {
        "descriptive": desc,
        "naive_welch_t_test": {"t": float(t_stat), "p": float(t_p)},
        "mann_whitney_u": {"u": float(u_stat), "p": float(u_p)},
        "cohens_d": float(effect),
        "bootstrap_diff_in_means": {
            "point_estimate": float(sparse_sims.mean() - dense_sims.mean()),
            "ci_low": ci_low,
            "ci_high": ci_high,
            "n_iter": int(n_boot),
        },
    }

    with open(results_dir / "stats.json", "w") as f:
        json.dump(summary, f, indent=2)

    # Pretty print for the console as well.
    print(json.dumps(summary, indent=2))


if __name__ == "__main__":
    main()
|