#!/usr/bin/env python3
"""Matplotlib draft figures for the PQC SIMD speedup analysis.

Usage:
    python3 analysis/figures.py [--json analysis/results.json] [--raw-json RAW.json] [--out analysis/figures/]
"""

import argparse
import json
from pathlib import Path

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------

# Figure 1 plots three cumulative stages; the speedup of every stage is taken
# from the same refo0 baseline. One row per stage:
#   (comparison key in the results JSON, legend label, bar colour)
_STAGES = [
    ("refo0_to_refnv", "O3, no auto-vec",       "#4C72B0"),
    ("refo0_to_ref",   "O3 + auto-vec",         "#55A868"),
    ("refo0_to_avx2",  "O3 + hand SIMD (avx2)", "#C44E52"),
]
STAGE_KEYS, STAGE_LABELS, STAGE_COLORS = (list(column) for column in zip(*_STAGES))

# Operations that appear in the primary figures. The top-level KEM wrappers
# are deliberately absent; they are listed separately in KEM_OPS below.
PRIMARY_OPS = {
    "poly_frommsg",
    "INVNTT",
    "polyvec_basemul_acc_montgomery",
    "NTT",
    "indcpa_dec",
    "polyvec_decompress",
    "poly_decompress",
    "poly_compress",
    "poly_tomsg",
    "polyvec_compress",
    "indcpa_enc",
    "indcpa_keypair",
    "gen_a",
    "poly_getnoise_eta1",
    "poly_getnoise_eta2",
}

# Operation name → compact tick label.
OP_SHORT = {
    "poly_frommsg": "frommsg",
    "INVNTT": "INVNTT",
    "polyvec_basemul_acc_montgomery": "basemul",
    "NTT": "NTT",
    "indcpa_dec": "dec",
    "polyvec_decompress": "pvec_decomp",
    "poly_decompress": "poly_decomp",
    "poly_compress": "poly_comp",
    "poly_tomsg": "tomsg",
    "polyvec_compress": "pvec_comp",
    "indcpa_enc": "enc",
    "indcpa_keypair": "keypair",
    "gen_a": "gen_a",
    "poly_getnoise_eta1": "noise_η₁",
    "poly_getnoise_eta2": "noise_η₂",
}

# Parameter sets in plotting order, with their display titles.
ALG_TITLES = {
    "mlkem512": "ML-KEM-512",
    "mlkem768": "ML-KEM-768",
    "mlkem1024": "ML-KEM-1024",
}
ALGORITHMS = list(ALG_TITLES)

# Panels of the distribution figure as (operation, panel title):
# one high-speedup arithmetic op, one medium SHAKE-bound op, one low-speedup op.
# NOTE(review): the speedup numbers in the titles are literals, not values
# read from the results JSON; keep them in sync by hand.
DIST_OPS = [
    ("INVNTT", "INVNTT\n(~55× speedup)"),
    ("gen_a", "gen_a\n(~4× speedup)"),
    ("poly_getnoise_eta1", "noise η₁\n(~1.3× speedup)"),
]

# Per-polynomial operations whose speedup should not depend on the parameter set.
CROSS_PARAM_OPS = [
    "poly_frommsg",
    "INVNTT",
    "polyvec_basemul_acc_montgomery",
    "NTT",
]

# KEM-level operations (supplementary figure), with their display names.
KEM_SHORT = {
    "kyber_keypair": "KeyGen",
    "kyber_encaps": "Encaps",
    "kyber_decaps": "Decaps",
}
KEM_OPS = list(KEM_SHORT)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def load(json_path: str) -> list[dict]:
    """Read a JSON file and return its decoded content.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` decodes from the file. The callers in this
        module treat it as a list of record dicts.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Pin the encoding: without it open() falls back to the locale's default,
    # so non-ASCII text in the JSON would decode differently per platform.
    with open(json_path, encoding="utf-8") as f:
        return json.load(f)


def ops_for_alg(results: list[dict], alg: str) -> list[dict]:
    """Return one algorithm's primary-op rows, largest ref→avx2 speedup first.

    A row is kept when its "algorithm" equals *alg* and its "operation" is a
    member of PRIMARY_OPS. Rows that lack the "ref_to_avx2" comparison, or a
    "speedup" inside it, are ordered as if their speedup were 0.
    """
    def descending_speedup(row: dict) -> float:
        # Negated so that an ascending, stable sort puts the largest first.
        avx2 = row["comparisons"].get("ref_to_avx2", {})
        return -avx2.get("speedup", 0)

    selected = []
    for row in results:
        if row["algorithm"] != alg:
            continue
        if row["operation"] in PRIMARY_OPS:
            selected.append(row)
    return sorted(selected, key=descending_speedup)


# ---------------------------------------------------------------------------
# Figure 1: cumulative grouped bars — speedup at each optimisation stage
#
# Each group shows three bars for one operation:
#   refo0→refnv   total speedup with O3, auto-vec OFF
#   refo0→ref     total speedup with O3, auto-vec ON
#   refo0→avx2    total speedup with O3 + hand-written SIMD
#
# Because all bars share the same baseline (refo0=1), they are directly
# comparable without any additive/multiplicative ambiguity.
# ---------------------------------------------------------------------------

def fig_decomposition(results: list[dict], out_dir: Path) -> None:
    """Draw Figure 1: cumulative speedup per optimisation stage, one panel per algorithm.

    Each panel shows the rows returned by ``ops_for_alg`` as groups of three
    bars, one per entry of STAGE_KEYS, on a log-scaled y axis. The "ci95"
    interval of each comparison is drawn as an error bar. A bar is omitted
    when its comparison is missing or its speedup is not positive. Panels of
    algorithms without any rows are hidden.

    Args:
        results: Analysed result rows; each needs "algorithm", "operation"
            and a "comparisons" mapping.
        out_dir: Directory the figure is written to through ``_save``.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharey=False)

    # Three bars per group, evenly spaced within each group slot
    bar_w = 0.22
    offsets = np.array([-bar_w, 0, bar_w])
    stages = list(zip(STAGE_KEYS, STAGE_LABELS, STAGE_COLORS, offsets))

    for ax, alg in zip(axes, ALGORITHMS):
        rows = ops_for_alg(results, alg)
        if not rows:
            ax.set_visible(False)
            continue

        ops   = [OP_SHORT.get(r["operation"], r["operation"]) for r in rows]
        n     = len(rows)
        group = np.arange(n)

        for key, label, color, offset in stages:
            vals, ci_lo, ci_hi = [], [], []
            for r in rows:
                comp = r["comparisons"].get(key, {})
                speedup = comp.get("speedup", 0.0)
                # Without a "ci95" fall back to a zero-length interval at the
                # point estimate. A [0, 0] fallback would yield a negative
                # upper error, which errorbar() rejects.
                ci = comp.get("ci95", (speedup, speedup))
                vals.append(speedup)
                ci_lo.append(ci[0])
                ci_hi.append(ci[1])

            vals  = np.array(vals, dtype=float)
            yerr  = np.array([vals - np.array(ci_lo, dtype=float),
                              np.array(ci_hi, dtype=float) - vals])
            # Non-positive speedups cannot be placed on a log axis.
            mask  = vals > 0

            ax.bar(group[mask] + offset, vals[mask], bar_w,
                   label=label, color=color, alpha=0.88, zorder=3)
            ax.errorbar(group[mask] + offset, vals[mask], yerr=yerr[:, mask],
                        fmt="none", ecolor="black", elinewidth=0.7, capsize=2, zorder=4)

        ax.set_yscale("log")
        ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda v, _: f"{v:g}×"))
        ax.set_title(ALG_TITLES[alg], fontsize=12, fontweight="bold")
        ax.set_xticks(group)
        ax.set_xticklabels(ops, rotation=45, ha="right", fontsize=8)
        ax.set_ylabel("Speedup over -O0 (×, log scale)" if alg == "mlkem512" else "")
        ax.grid(axis="y", which="both", linestyle="--", linewidth=0.4, alpha=0.5, zorder=0)
        ax.set_axisbelow(True)
        ax.set_xlim(-0.5, n - 0.5)

    # Take the legend entries from the first panel that actually drew bars;
    # the first panel may be hidden when its algorithm has no rows.
    handles, labels = [], []
    for ax in axes:
        handles, labels = ax.get_legend_handles_labels()
        if handles:
            break
    fig.legend(handles, labels, loc="upper center", ncol=3,
               fontsize=10, frameon=True, bbox_to_anchor=(0.5, 1.02))
    fig.suptitle(
        "ML-KEM Cumulative Speedup at Each Optimisation Stage  "
        "(Intel Xeon Platinum 8268, 95% bootstrap CI)",
        fontsize=11, y=1.06,
    )
    fig.tight_layout()
    _save(fig, out_dir, "decomposition")


# ---------------------------------------------------------------------------
# Figure 2: hand-SIMD speedup (ref→avx2), all algorithms overlaid, log scale
# ---------------------------------------------------------------------------

def fig_hand_simd(results: list[dict], out_dir: Path) -> None:
    """Draw Figure 2: ref→avx2 speedup of every primary op, one bar per algorithm.

    Operations are ordered by their mlkem512 speedup, largest first (an op
    without an mlkem512 entry sorts as 0). Bars use a log-scaled y axis and
    carry the "ci95" interval as an error bar. An algorithm that has no
    positive speedup for an op gets no bar there.

    Args:
        results: Analysed result rows; each needs "algorithm", "operation"
            and a "comparisons" mapping.
        out_dir: Directory the figure is written to through ``_save``.
    """
    # operation → {algorithm → its "ref_to_avx2" comparison}
    all_ops: dict[str, dict] = {}
    for r in results:
        if r["operation"] in PRIMARY_OPS and "ref_to_avx2" in r["comparisons"]:
            all_ops.setdefault(r["operation"], {})
            all_ops[r["operation"]][r["algorithm"]] = r["comparisons"]["ref_to_avx2"]

    ops_sorted = sorted(
        all_ops,
        key=lambda op: -all_ops[op].get("mlkem512", {}).get("speedup", 0),
    )
    short_ops = [OP_SHORT.get(op, op) for op in ops_sorted]

    x       = np.arange(len(ops_sorted))
    bar_w   = 0.25
    offsets = [-bar_w, 0, bar_w]
    colors  = ["#4C72B0", "#55A868", "#C44E52"]

    fig, ax = plt.subplots(figsize=(14, 5))

    for alg, offset, color in zip(ALGORITHMS, offsets, colors):
        vals, ci_lo, ci_hi = [], [], []
        for op in ops_sorted:
            comp = all_ops[op].get(alg, {})
            speedup = comp.get("speedup", 0)
            # Without a "ci95" fall back to a zero-length interval at the
            # point estimate. A [0, 0] fallback would yield a negative upper
            # error, which errorbar() rejects.
            ci = comp.get("ci95", (speedup, speedup))
            vals.append(speedup)
            ci_lo.append(ci[0])
            ci_hi.append(ci[1])

        vals  = np.array(vals, dtype=float)
        yerr  = np.array([vals - np.array(ci_lo, dtype=float),
                          np.array(ci_hi, dtype=float) - vals])
        # Non-positive speedups cannot be placed on a log axis.
        mask  = vals > 0

        ax.bar(x[mask] + offset, vals[mask], bar_w,
               label=ALG_TITLES[alg], color=color, alpha=0.85, zorder=3)
        ax.errorbar(x[mask] + offset, vals[mask], yerr=yerr[:, mask],
                    fmt="none", ecolor="black", elinewidth=0.7, capsize=2, zorder=4)

    ax.set_yscale("log")
    ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda v, _: f"{v:g}×"))
    ax.set_xticks(x)
    ax.set_xticklabels(short_ops, rotation=45, ha="right", fontsize=9)
    ax.set_ylabel("Speedup ref → avx2 (×, log scale)")
    ax.set_title(
        "Hand-Written SIMD Speedup over Compiler-Optimised C\n"
        "(Intel Xeon Platinum 8268, 95% bootstrap CI, n≥2000 per group)"
    )
    ax.grid(axis="y", which="both", linestyle="--", linewidth=0.4, alpha=0.5, zorder=0)
    ax.set_axisbelow(True)
    ax.legend(fontsize=10)

    fig.tight_layout()
    _save(fig, out_dir, "hand_simd_speedup")


# ---------------------------------------------------------------------------
# Figure 3: Cliff's delta heatmap (ref→avx2)
# ---------------------------------------------------------------------------

def fig_cliffs_heatmap(results: list[dict], out_dir: Path) -> None:
    """Draw Figure 3: heatmap of the ref→avx2 "cliffs_delta" per algorithm and op.

    Columns are all operations that have a "ref_to_avx2" comparison in any
    row (not only PRIMARY_OPS), ordered by their largest delta, descending.
    Rows are the entries of ALGORITHMS. Cells without data stay NaN and are
    not annotated.

    Args:
        results: Analysed result rows; each needs "algorithm", "operation"
            and a "comparisons" mapping.
        out_dir: Directory the figure is written to through ``_save``.
    """
    comparison = "ref_to_avx2"

    def neg_best_delta(op: str) -> float:
        # Negated maximum, so that an ascending sort lists the largest first.
        deltas = (
            r["comparisons"][comparison]["cliffs_delta"]
            for r in results
            if r["operation"] == op and comparison in r["comparisons"]
        )
        return -max(deltas)

    op_names = {r["operation"] for r in results if comparison in r["comparisons"]}
    ops_set = sorted(op_names, key=neg_best_delta)
    short_ops = [OP_SHORT.get(op, op) for op in ops_set]
    n_algs = len(ALGORITHMS)
    n_ops = len(ops_set)

    data = np.full((n_algs, n_ops), np.nan)
    for row_idx, alg in enumerate(ALGORITHMS):
        for col_idx, op in enumerate(ops_set):
            # Only the first record of an (algorithm, operation) pair counts.
            first = next(
                (r for r in results if r["algorithm"] == alg and r["operation"] == op),
                None,
            )
            if first is not None and comparison in first["comparisons"]:
                data[row_idx, col_idx] = first["comparisons"][comparison]["cliffs_delta"]

    fig, ax = plt.subplots(figsize=(max(10, n_ops * 0.85), 3.2))
    im = ax.imshow(data, aspect="auto", cmap="RdYlGn", vmin=-1, vmax=1)
    plt.colorbar(im, ax=ax, label="Cliff's δ", fraction=0.03, pad=0.02)

    ax.set_yticks(range(n_algs))
    ax.set_yticklabels([ALG_TITLES[a] for a in ALGORITHMS], fontsize=10)
    ax.set_xticks(range(n_ops))
    ax.set_xticklabels(short_ops, rotation=45, ha="right", fontsize=9)
    ax.set_title(
        "Cliff's δ  (ref vs. avx2)     δ = +1.00: avx2 strictly faster in every observation pair",
        fontsize=10,
    )

    # Print the value inside every populated cell.
    for (row_idx, col_idx), delta in np.ndenumerate(data):
        if np.isnan(delta):
            continue
        # White text where the delta exceeds 0.85 (the darkest green cells),
        # black everywhere else.
        text_color = "white" if delta > 0.85 else "black"
        ax.text(col_idx, row_idx, f"{delta:+.3f}", ha="center", va="center",
                fontsize=9, color=text_color, fontweight="bold")

    fig.tight_layout()
    _save(fig, out_dir, "cliffs_delta_heatmap")


# ---------------------------------------------------------------------------
# Figure 4: cycle distribution overlays (requires raw aggregator JSON)
#
# Three panels: one high-speedup op, one medium, one low.
# Each panel overlays ref and avx2 histograms + KDE for mlkem512.
# Log x-axis exposes the scale difference honestly.
# ---------------------------------------------------------------------------

def fig_distributions(raw_records: list[dict], out_dir: Path, alg: str = "mlkem512") -> None:
    """Draw Figure 4: ref vs. avx2 cycle-count distributions for the DIST_OPS panels.

    For each (operation, title) in DIST_OPS one panel overlays, per variant,
    a density histogram on logarithmic bins, a Gaussian KDE fitted in log10
    space, and a dashed line at the median. A panel with nothing to plot is
    hidden.

    Samples that are not strictly positive are dropped before plotting,
    because they have no position on a log axis. A variant left with fewer
    than two samples is skipped, since the KDE cannot be fitted to it.

    Args:
        raw_records: Raw groups; each record is read for "algorithm",
            "variant", "operation" and a non-empty "raw" list of samples.
        out_dir: Directory the figure is written to through ``_save``.
        alg: Algorithm whose samples are plotted.
    """
    # Imported here so that the other figures do not require SciPy.
    from scipy.stats import gaussian_kde

    # Build lookup: (alg, variant, op) → raw array
    raw: dict[tuple, np.ndarray] = {}
    for r in raw_records:
        if r.get("raw"):
            raw[(r["algorithm"], r["variant"], r["operation"])] = np.array(r["raw"], dtype=np.float64)

    n_ops = len(DIST_OPS)
    # squeeze=False keeps a 2-D axes array even for a single panel, so the
    # zip below also works when DIST_OPS has exactly one entry.
    fig, axes = plt.subplots(1, n_ops, figsize=(5 * n_ops, 4), squeeze=False)

    variant_style = {
        "ref":  {"color": "#4C72B0", "label": "ref (O3)",   "alpha": 0.55, "zorder": 2},
        "avx2": {"color": "#C44E52", "label": "avx2",       "alpha": 0.65, "zorder": 3},
    }

    for ax, (op, subtitle) in zip(axes[0], DIST_OPS):
        plotted_any = False
        for variant in ("ref", "avx2"):
            samples = raw.get((alg, variant, op))
            if samples is None:
                continue

            # log10 of a zero/negative/NaN sample would turn the bin edges
            # into -inf/NaN and make hist() fail, so keep positives only.
            arr = samples[samples > 0]
            dropped = samples.size - arr.size
            if dropped:
                print(f"Note: dropped {dropped} non-positive sample(s) from {alg}/{variant}/{op}.")
            if arr.size < 2:
                # gaussian_kde refuses a dataset with a single element.
                print(f"Skipping {alg}/{variant}/{op}: fewer than two positive samples.")
                continue

            plotted_any = True
            s = variant_style[variant]

            # Histogram on log scale; bins span whole decades around the data
            log_arr = np.log10(arr)
            lo, hi = np.floor(log_arr.min()), np.ceil(log_arr.max())
            bins = np.logspace(lo, hi, 60)
            ax.hist(arr, bins=bins, density=True, color=s["color"],
                    alpha=s["alpha"], zorder=s["zorder"], label=s["label"])

            # KDE on log scale, back-transformed
            kde = gaussian_kde(log_arr, bw_method=0.25)
            xs_log = np.linspace(lo, hi, 400)
            xs = 10 ** xs_log
            # KDE is in log space; convert density: p(x) = p(log x) / (x ln10)
            ys = kde(xs_log) / (xs * np.log(10))
            ax.plot(xs, ys, color=s["color"], linewidth=1.8, zorder=s["zorder"] + 1)

            # Median line
            med = float(np.median(arr))
            ax.axvline(med, color=s["color"], linewidth=1.2, linestyle="--", zorder=5)

        if not plotted_any:
            ax.set_visible(False)
            continue

        ax.set_xscale("log")
        ax.set_xlabel("Cycles (log scale)")
        # Only the first panel carries the y label.
        ax.set_ylabel("Density" if op == DIST_OPS[0][0] else "")
        ax.set_title(subtitle, fontsize=10)
        ax.legend(fontsize=9)
        ax.xaxis.set_major_formatter(ticker.LogFormatterSciNotation(labelOnlyBase=False))
        ax.grid(axis="x", which="both", linestyle="--", linewidth=0.4, alpha=0.4)
        ax.set_axisbelow(True)

    fig.suptitle(
        f"Cycle Count Distributions — ref vs. avx2  ({ALG_TITLES[alg]})\n"
        "Dashed lines show medians. Distributions are right-skewed → nonparametric statistics.",
        fontsize=10,
    )
    fig.tight_layout()
    _save(fig, out_dir, "distributions")


# ---------------------------------------------------------------------------
# Figure 5: cross-param speedup consistency
#
# For per-polynomial operations the polynomial dimension is always 256,
# independent of the security parameter k. Speedups should be identical
# across mlkem512/768/1024. This figure verifies that.
# ---------------------------------------------------------------------------

def fig_cross_param(results: list[dict], out_dir: Path) -> None:
    """Draw Figure 5: ref→avx2 speedup of the CROSS_PARAM_OPS, one bar per algorithm.

    Args:
        results: Analysed result rows; each needs "algorithm", "operation"
            and a "comparisons" mapping.
        out_dir: Directory the figure is written to through ``_save``.
    """
    _ref_to_avx2_bars(
        results,
        out_dir,
        ops=CROSS_PARAM_OPS,
        tick_labels=[OP_SHORT.get(op, op) for op in CROSS_PARAM_OPS],
        tick_fontsize=11,
        figsize=(8, 4),
        title=(
            "Per-Polynomial Operation Speedup Across Security Parameters\n"
            "(polynomial dim = 256 for all; NTT variation attributed to cache-state differences)"
        ),
        stem="cross_param",
    )


# ---------------------------------------------------------------------------
# Figure S1: KEM-level end-to-end speedup (supplementary)
# ---------------------------------------------------------------------------

def fig_kem_level(results: list[dict], out_dir: Path) -> None:
    """Draw Figure S1: ref→avx2 speedup of the KEM_OPS, one bar per algorithm.

    Args:
        results: Analysed result rows; each needs "algorithm", "operation"
            and a "comparisons" mapping.
        out_dir: Directory the figure is written to through ``_save``.
    """
    _ref_to_avx2_bars(
        results,
        out_dir,
        ops=KEM_OPS,
        tick_labels=[KEM_SHORT[op] for op in KEM_OPS],
        tick_fontsize=12,
        figsize=(7, 4),
        title=(
            "End-to-End KEM Speedup (ref → avx2)\n"
            "(Intel Xeon Platinum 8268, 95% bootstrap CI)"
        ),
        stem="kem_level",
    )


# ---------------------------------------------------------------------------
# Shared renderer for Figures 5 and S1
# ---------------------------------------------------------------------------

def _ref_to_avx2_bars(
    results: list[dict],
    out_dir: Path,
    *,
    ops: list[str],
    tick_labels: list[str],
    tick_fontsize: int,
    figsize: tuple,
    title: str,
    stem: str,
) -> None:
    """Render a linear-scale grouped bar chart of ref→avx2 speedups and save it.

    One group per entry of *ops*, one bar per entry of ALGORITHMS, each with
    its "ci95" interval as an error bar. An (algorithm, operation) pair with
    no record, no "ref_to_avx2" comparison, or a non-positive speedup gets
    no bar.

    Args:
        results: Analysed result rows.
        out_dir: Directory the figure is written to through ``_save``.
        ops: Operation names, in x-axis order.
        tick_labels: x tick label for each entry of *ops*.
        tick_fontsize: Font size of the x tick labels.
        figsize: Figure size handed to ``plt.subplots``.
        title: Axes title.
        stem: File stem handed to ``_save``.
    """
    x = np.arange(len(ops))
    bar_w = 0.22
    offsets = np.array([-bar_w, 0, bar_w])
    colors = ["#4C72B0", "#55A868", "#C44E52"]

    # Index the rows once instead of scanning them for every bar. setdefault
    # keeps the FIRST record of each (algorithm, operation) pair.
    first_record: dict[tuple, dict] = {}
    for r in results:
        first_record.setdefault((r["algorithm"], r["operation"]), r)

    fig, ax = plt.subplots(figsize=figsize)

    for alg, offset, color in zip(ALGORITHMS, offsets, colors):
        vals, ci_lo, ci_hi = [], [], []
        for op in ops:
            record = first_record.get((alg, op))
            if record is not None and "ref_to_avx2" in record["comparisons"]:
                c = record["comparisons"]["ref_to_avx2"]
                vals.append(c["speedup"])
                ci_lo.append(c["ci95"][0])
                ci_hi.append(c["ci95"][1])
            else:
                # Placeholder; removed again by the mask below.
                vals.append(0)
                ci_lo.append(0)
                ci_hi.append(0)

        vals  = np.array(vals)
        ci_lo = np.array(ci_lo)
        ci_hi = np.array(ci_hi)
        yerr  = np.array([vals - ci_lo, ci_hi - vals])
        mask  = vals > 0

        ax.bar(x[mask] + offset, vals[mask], bar_w,
               label=ALG_TITLES[alg], color=color, alpha=0.88, zorder=3)
        ax.errorbar(x[mask] + offset, vals[mask], yerr=yerr[:, mask],
                    fmt="none", ecolor="black", elinewidth=0.8, capsize=3, zorder=4)

    ax.set_xticks(x)
    ax.set_xticklabels(tick_labels, fontsize=tick_fontsize)
    ax.set_ylabel("Speedup ref → avx2 (×)")
    ax.set_title(title)
    ax.yaxis.set_minor_locator(ticker.AutoMinorLocator())
    ax.grid(axis="y", linestyle="--", linewidth=0.4, alpha=0.5, zorder=0)
    ax.set_axisbelow(True)
    ax.legend(fontsize=10)

    fig.tight_layout()
    _save(fig, out_dir, stem)


# ---------------------------------------------------------------------------
# Shared save helper
# ---------------------------------------------------------------------------

def _save(fig: plt.Figure, out_dir: Path, stem: str) -> None:
    """Write *fig* as ``<stem>.pdf`` and ``<stem>.png`` into *out_dir*, then close it.

    Both files use a tight bounding box; the PNG is rendered at 150 dpi.
    A one-line confirmation is printed after both files are written.
    """
    # (file suffix, savefig options beyond the shared bounding box)
    targets = (
        ("pdf", {}),
        ("png", {"dpi": 150}),
    )
    for suffix, extra in targets:
        fig.savefig(out_dir / f"{stem}.{suffix}", bbox_inches="tight", **extra)

    print(f"Saved {out_dir}/{stem}.{{pdf,png}}")
    # Close the figure so pyplot does not keep it open.
    plt.close(fig)


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

def main() -> None:
    """Parse the command line and render every figure into the output directory.

    The five figures built from the analysed results are always rendered.
    The distributions figure is rendered only when ``--raw-json`` is given.
    The output directory is created if it does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Matplotlib draft figures for the PQC SIMD speedup analysis.",
    )
    parser.add_argument("--json",     default="analysis/results.json",
                        help="analyzed results JSON (from analyze.py)")
    # There is no separate switch for the distributions figure: supplying this
    # file is what enables it.
    parser.add_argument("--raw-json", default=None,
                        help="raw aggregator JSON (from aggregate --raw); "
                             "required for the distributions figure")
    parser.add_argument("--out",      default="analysis/figures",
                        help="output directory for the figures (created if missing)")
    args = parser.parse_args()

    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)

    results = load(args.json)
    print(f"Loaded {len(results)} result rows.")

    fig_decomposition(results, out_dir)
    fig_hand_simd(results, out_dir)
    fig_cliffs_heatmap(results, out_dir)
    fig_cross_param(results, out_dir)
    fig_kem_level(results, out_dir)

    if args.raw_json:
        raw_records = load(args.raw_json)
        print(f"Loaded {len(raw_records)} raw groups for distributions.")
        fig_distributions(raw_records, out_dir)
    else:
        print("Skipping distributions figure (pass --raw-json to enable).")


if __name__ == "__main__":
    main()