"""opsd_loss.py — Self-distillation loss, lifted from siyan-zhao/OPSD.

Original source: github.com/siyan-zhao/OPSD::OPSDTrainer.generalized_jsd_loss (MIT).
Verified self-contained via DeepWiki audit on 2026-05-25.
Re-aligned byte-for-byte against upstream `opsd_trainer.py` lines 381-479 on
2026-05-26 after Wave 15 math review found three numerical divergences (mixture
weighting, β coefficient placement, reduction divisor) and one docstring mislabel.

Mathematical reference:
- OPSD paper: Zhao et al., "Self-Distilled Reasoner: On-Policy Self-Distillation
  for LLMs", arXiv:2601.18734.
- SDPO paper: Hübotter et al., "Reinforcement Learning via Self-Distillation",
  arXiv:2601.20802 (formalizes the same loss as Composer 2.5's "Targeted RL with
  Textual Feedback").

The loss computes JSD/KL divergence between a teacher distribution (model
conditioned on privileged information / a hint) and a student distribution
(model on the original context). Both come from the SAME model — the teacher
is just "the model with hint inserted into context."

Composer 2.5 uses this with the privileged information being a "hint" inserted
at the error-turn site. We use the same loss; the data collator constructs
ctx_teacher = ctx_student + hint_at_error_turn for us.
"""

from __future__ import annotations

import torch
import torch.nn.functional as F


def generalized_jsd_loss(
    student_logits: torch.Tensor,
    teacher_logits: torch.Tensor,
    labels: torch.Tensor | None = None,
    beta: float = 0.5,
    temperature: float = 1.0,
    reduction: str = "batchmean",
    logits_are_probs: bool = False,
    top_k: int | None = None,
    token_clip: float | None = None,
) -> torch.Tensor:
    """Generalized Jensen-Shannon Divergence loss between student and teacher.

    Port of `OPSDTrainer.generalized_jsd_loss` (siyan-zhao/OPSD). See
    https://huggingface.co/papers/2306.13649 Eq. (1) for the definition.
    The arithmetic for valid inputs is kept as in the original port; this
    version additionally validates `beta`, `temperature` and `reduction` up
    front, clamps `top_k` to the vocabulary size, and guards the "batchmean"
    divisor against zero.

    Args:
        student_logits: (B, T, V) student logits at each token position
            (probabilities instead when `logits_are_probs=True`).
        teacher_logits: (B, T, V) teacher (= same model with hint context)
            logits, same layout as `student_logits`.
        labels: optional (B, T) tensor. Positions whose value is exactly -100
            are excluded from the loss (standard HF ignore index); EVERY other
            value, including 0, counts as a valid position. A 0/1 mask must
            therefore be converted to the -100 convention by the caller.
        beta: interpolation coefficient in [0, 1]. `F.kl_div(log_q, log_p,
            log_target=True)` computes the terms of KL(p || q), hence:
              beta = 0   -> KL(teacher || student): forward KL, mode-covering
                            for the student.
              beta = 1   -> KL(student || teacher): reverse KL, mode-seeking
                            for the student.
              otherwise  -> M = (1 - beta) * P_student + beta * P_teacher and
                            jsd = beta * KL(teacher || M)
                                  + (1 - beta) * KL(student || M);
                            beta = 0.5 is the symmetric JSD.
        temperature: positive divisor applied to both logit tensors before
            the softmax. Ignored when `logits_are_probs=True`.
        reduction: "batchmean" | "sum" | "mean" | "none".
            "batchmean" divides the summed divergence by the number of
            non-ignored positions when `labels` is given, otherwise by the
            leading dimension (batch size). The divisor is clamped to at
            least 1, so a batch whose positions are all ignored yields 0
            rather than NaN. "mean" averages over every remaining element,
            vocabulary axis included.
        logits_are_probs: if True, inputs are already probabilities; they are
            clamped to >= 1e-8 and logged directly. `temperature` and `top_k`
            are not applied on this path.
        top_k: if a positive int, keep only the teacher's top-k vocabulary
            entries per position (same indices gathered from the student) and
            renormalize both via log-softmax. Values larger than the
            vocabulary size are treated as the full vocabulary. `None` or a
            non-positive value disables the restriction.
        token_clip: optional upper bound applied elementwise to the unreduced
            divergence terms (one term per vocabulary entry), before masking
            and reduction. `None` disables clipping.

    Returns:
        A scalar tensor for "batchmean", "sum" and "mean". For "none", the
        unreduced terms: shape (B, T, K) without `labels`, or (n_valid, K)
        with `labels`, where K is V or the effective top-k.

    Raises:
        ValueError: if `beta` is outside [0, 1], if `temperature` is not
            positive (logits path only), or if `reduction` is unknown.
    """
    # Fail fast on bad configuration instead of silently producing NaN/inf
    # (log of a negative mixture weight, division by a zero temperature) or
    # doing the full computation before rejecting the reduction name.
    if not 0.0 <= beta <= 1.0:
        raise ValueError(f"beta must be in [0, 1], got {beta}")
    if reduction not in ("batchmean", "sum", "mean", "none"):
        raise ValueError(f"Unknown reduction: {reduction}")

    if logits_are_probs:
        # Probabilities in: take the log directly; the clamp avoids log(0).
        student_log_probs = torch.log(student_logits.clamp_min(1e-8))
        teacher_log_probs = torch.log(teacher_logits.clamp_min(1e-8))
    else:
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")

        # Temperature scaling happens on the logits, before normalization.
        student_logits = student_logits / temperature
        teacher_logits = teacher_logits / temperature

        if top_k is not None and top_k > 0:
            # torch.topk rejects k > vocab size; asking for more entries than
            # exist is equivalent to using the whole vocabulary.
            k = min(top_k, teacher_logits.size(-1))
            _, top_k_indices = torch.topk(teacher_logits, k=k, dim=-1)
            student_logits = torch.gather(student_logits, dim=-1, index=top_k_indices)
            teacher_logits = torch.gather(teacher_logits, dim=-1, index=top_k_indices)

        # log_softmax over the (possibly restricted) vocabulary renormalizes.
        student_log_probs = F.log_softmax(student_logits, dim=-1)
        teacher_log_probs = F.log_softmax(teacher_logits, dim=-1)

    if beta == 0:
        # input=student, target=teacher -> terms of KL(teacher || student).
        jsd = F.kl_div(student_log_probs, teacher_log_probs, reduction="none", log_target=True)
    elif beta == 1:
        # input=teacher, target=student -> terms of KL(student || teacher).
        jsd = F.kl_div(teacher_log_probs, student_log_probs, reduction="none", log_target=True)
    else:
        # log M = logsumexp([log P_student + log(1-beta), log P_teacher + log(beta)])
        # The weight is materialized in the log-probs' dtype/device, kept in a
        # separate local so the `beta` argument itself is not rebound.
        beta_t = torch.tensor(beta, dtype=student_log_probs.dtype, device=student_log_probs.device)
        mixture_log_probs = torch.logsumexp(
            torch.stack(
                [
                    student_log_probs + torch.log1p(-beta_t),
                    teacher_log_probs + torch.log(beta_t),
                ]
            ),
            dim=0,
        )

        # F.kl_div takes (input=log_q, target=log_p) for KL(p || q), so the
        # mixture goes first in both calls.
        kl_teacher = F.kl_div(mixture_log_probs, teacher_log_probs, reduction="none", log_target=True)
        kl_student = F.kl_div(mixture_log_probs, student_log_probs, reduction="none", log_target=True)

        # beta weights the teacher leg, (1 - beta) the student leg.
        jsd = beta_t * kl_teacher + (1 - beta_t) * kl_student

    # Elementwise upper clamp on the unreduced terms (no lower bound).
    if token_clip is not None:
        jsd = jsd.clamp(max=token_clip)

    # A (B, T) boolean mask indexes the first two dims of the (B, T, K)
    # tensor, leaving (n_valid, K).
    mask = None
    if labels is not None:
        mask = labels != -100
        jsd = jsd[mask]

    if reduction == "batchmean":
        if mask is not None:
            # With every position ignored, jsd is empty and sums to 0; the
            # clamp turns the resulting 0/0 (NaN) into a clean 0 loss.
            return jsd.sum() / mask.sum().clamp_min(1)
        return jsd.sum() / max(jsd.size(0), 1)
    if reduction == "sum":
        return jsd.sum()
    if reduction == "mean":
        return jsd.mean()
    # reduction == "none" (validated above).
    return jsd


# Public API of this module: star-imports expose only the loss function.
__all__ = ["generalized_jsd_loss"]