Experiment 1.3: MLP with 3D bottleneck and curriculum learning

In Experiment 1.2, we successfully trained an MLP autoencoder with a 2D bottleneck to represent bright, saturated colors in a structure resembling the color wheel. This worked because the primary variation in the data was hue, which naturally maps to a circle.

Now, let's introduce more complexity: variations in brightness (value). A 2D bottleneck is likely insufficient to capture both hue and value cleanly. So, in this experiment, we will:

  1. Increase the bottleneck dimension to 3D.
  2. Employ curriculum learning: We'll start by training the model on just the bright hues (similar to Ex 1.2) to establish the color wheel structure, hopefully in the first two dimensions of the bottleneck. Then, we'll gradually introduce darker colors (lower value) in subsequent training phases.
  3. Use regularization: We'll add loss terms to encourage the model to keep the hue representation planar (in the first two dimensions) and to keep all points near a unit sphere.

The hypothesis is that the model will learn the color wheel in a 2D plane within the 3D bottleneck space during the initial phases. As darker colors are introduced, the regularization will encourage the model to represent the decreasing value by moving points out of that initial plane while preserving the hue structure.

Model: MLP with 3D bottleneck

The model architecture is very similar to the previous experiment, but with a bottleneck layer size of 3 (increased from 2). We will not use explicit normalization in the forward pass this time; instead, we'll rely on regularization terms in the loss function during training to encourage points towards a unit sphere.

from dataclasses import dataclass
import torch
import torch.nn as nn


class ColorMLP(nn.Module):
    def __init__(self, normalize_bottleneck=False):
        super().__init__()
        # RGB input (3D) → hidden layer → bottleneck → hidden layer → RGB output
        self.encoder = nn.Sequential(
            nn.Linear(3, 16),
            nn.GELU(),
            nn.Linear(16, 3),  # Our critical bottleneck!
        )

        self.decoder = nn.Sequential(
            nn.Linear(3, 16),
            nn.GELU(),
            nn.Linear(16, 3),
            nn.Sigmoid(),  # Keep RGB values in [0,1]
        )

        self.normalize = normalize_bottleneck

    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        # Get our bottleneck representation
        bottleneck = self.encoder(x)

        # Optionally normalize to unit vectors (like nGPT)
        if self.normalize:
            norm = torch.norm(bottleneck, dim=1, keepdim=True)
            bottleneck = bottleneck / (norm + 1e-8)  # Avoid division by zero

        # Decode back to RGB
        output = self.decoder(bottleneck)
        return output, bottleneck
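
As a quick sanity check (illustrative only, not part of the experiment), we can push a small batch of random RGB values through the model and confirm that both the reconstruction and the bottleneck come back with the expected shapes:

# Illustrative sanity check: shapes of the reconstruction and bottleneck
demo_model = ColorMLP(normalize_bottleneck=False)
demo_batch = torch.rand(5, 3)  # 5 random RGB colors in [0, 1]
recon, latent = demo_model(demo_batch)
print(recon.shape, latent.shape)  # torch.Size([5, 3]) torch.Size([5, 3])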

Curriculum learning setup

We define a sequence of training phases, each with its own dataset, number of epochs, and regularization weights.

The curriculum:

| Phase | Training data | Normalization | Planar constraint |
|-------|---------------|---------------|-------------------|
| 1 | Pure primary, secondary, and tertiary colors | Weak | Strong |
| 2 | "All" pure hues | Moderate | Strong |
| 3 | A few slightly darker tones | Moderate | Weak |
| 4 | More slightly darker tones | Strong | Very weak |
| 5 | Many darker tones | Strong | None |

Regularization terms:

  • Normalization loss: Penalizes latent vectors whose L2 norm deviates from 1.0. This encourages points to lie on the surface of a unit sphere.
  • Planar constraint loss: Penalizes non-zero values in the 3rd dimension (latents[:, 2]) of the bottleneck. This encourages the representation to stay flat (primarily use the first two dimensions), especially in early phases.
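
In symbols, for a batch of $N$ latent vectors $z_i \in \mathbb{R}^3$, these terms (exactly as implemented in the training loop below) are

$$\mathcal{L}_{\text{norm}} = \frac{1}{N}\sum_{i=1}^{N}\big(\lVert z_i \rVert_2 - 1\big)^2, \qquad \mathcal{L}_{\text{planar}} = \frac{1}{N}\sum_{i=1}^{N} z_{i,3}^2$$

and the total loss for a phase is

$$\mathcal{L} = \mathcal{L}_{\text{recon}} + w_{\text{norm}}\,\mathcal{L}_{\text{norm}} + w_{\text{planar}}\,\mathcal{L}_{\text{planar}}$$

where $w_{\text{norm}}$ and $w_{\text{planar}}$ are the per-phase weights (norm_loss_weight and nonplanar_loss_weight) from the table above.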

We gradually decrease the planar constraint and increase the normalization constraint. The idea is to first firmly establish the 2D color wheel and then allow the model to use the third dimension for brightness while encouraging the overall structure to remain spherical.

Learning rate schedule: To avoid disrupting the learned structure when transitioning between phases, we use a linear learning rate warmup at the beginning of each phase. The LR starts low and ramps up over the first 10% of the phase's epochs.
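
To make the warmup concrete, here's a small standalone sketch (not part of the training code) of how LinearLR ramps the LR over the first few steps of a phase; the training loop below configures it the same way:

# Illustrative only: LinearLR warmup on a throwaway optimizer
from torch import optim
from torch.optim.lr_scheduler import LinearLR

dummy_param = torch.zeros(1, requires_grad=True)
dummy_opt = optim.Adam([dummy_param], lr=0.01)
warmup_epochs = 200  # e.g. 10% of a 2000-epoch phase
sched = LinearLR(dummy_opt, start_factor=1 / warmup_epochs, end_factor=1.0, total_iters=warmup_epochs)

for epoch in range(3):
    dummy_opt.step()  # no-op here (no gradients); just keeps the expected call order
    sched.step()
    print(epoch, sched.get_last_lr()[0])  # LR climbs linearly from 0.01/200 towards 0.01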

Training implementation

Let's define the CurriculumPhase dataclass and the updated train_color_model function incorporating the curriculum logic, regularization losses, and LR scheduler.

import numpy as np

from ex_color.data.color_cube import ColorCube
from ex_color.data.cyclic import arange_cyclic


@dataclass
class CurriculumPhase:
    name: str
    data: torch.Tensor
    epochs: int
    norm_loss_weight: float
    nonplanar_loss_weight: float


def train_color_model(model: ColorMLP, curriculum: list[CurriculumPhase], base_lr=0.001, warmup_fraction=0.1):
    from torch import optim
    from torch.optim.lr_scheduler import LinearLR
    from tqdm import tqdm

    # Create optimizer once for the entire training process
    optimizer = optim.Adam(model.parameters(), lr=base_lr)
    criterion = nn.MSELoss()

    # Store bottleneck values for visualization
    bottleneck_history = []

    total_epochs = sum(phase.epochs for phase in curriculum)

    with tqdm(total=total_epochs) as pbar:
        for phase in curriculum:
            warmup_epochs = max(1, int(phase.epochs * warmup_fraction))
            scheduler = LinearLR(
                optimizer,
                start_factor=1 / warmup_epochs,
                end_factor=1.0,
                total_iters=warmup_epochs,
            )

            for _ in range(phase.epochs):
                pbar.update(1)
                optimizer.zero_grad()

                # Forward pass
                outputs, latents = model(phase.data)

                # Reconstruction loss (main objective)
                recon_loss = criterion(outputs, phase.data)

                # Regularization losses

                # Penalize latents not being unit vectors
                norms = torch.norm(latents, dim=1)  # L2 norm of each latent
                norm_loss = torch.mean((norms - 1.0) ** 2)  # MSE to unit norm

                # Penalize non-planar latents (latents[:, 2] should be close to 0)
                nonplanar_loss = torch.mean(latents[:, 2] ** 2)  # Penalize non-planar latents

                loss = (
                    recon_loss
                    + phase.norm_loss_weight * norm_loss
                    + phase.nonplanar_loss_weight * nonplanar_loss
                )  # fmt: skip

                # Backward pass
                loss.backward()
                optimizer.step()
                scheduler.step()

                # current_lr = optimizer.param_groups[0]['lr']
                pbar.set_postfix(
                    phase=phase.name,
                    lr=f'{scheduler.get_last_lr()[0]:.6f}',
                    loss=f'{loss.item():.4f}',
                    recon=f'{recon_loss.item():.4f}',
                    norm=f'{norm_loss.item():.4f}',
                    nonplanar=f'{nonplanar_loss.item():.4f}',
                )

            with torch.no_grad():
                _, latents = model(phase.data)
                bottleneck_history.append(latents.clone())

    return bottleneck_history


# Create a model variant without explicit normalization
model = ColorMLP(normalize_bottleneck=False)

curriculum = [
    CurriculumPhase(
        name='Primary, secondary, tertiary',
        data=torch.tensor(
            ColorCube.from_hsv(
                h=arange_cyclic(step_size=1 / 12),
                s=np.array([1]),
                v=np.array([1]),
            )
            .permute('svh')
            .rgb_grid.reshape(-1, 3),
            dtype=torch.float32,
        ),
        epochs=2000,
        norm_loss_weight=0.02,
        nonplanar_loss_weight=0.1,
    ),
    CurriculumPhase(
        name='Bright hues only',
        data=torch.tensor(
            ColorCube.from_hsv(
                h=arange_cyclic(step_size=1 / 360),
                s=np.array([1]),
                v=np.array([1]),
            )
            .permute('svh')
            .rgb_grid.reshape(-1, 3),
            dtype=torch.float32,
        ),
        epochs=2000,
        norm_loss_weight=0.02,
        nonplanar_loss_weight=0.1,
    ),
    CurriculumPhase(
        name='Slightly darker hues',
        data=torch.tensor(
            ColorCube.from_hsv(
                h=arange_cyclic(step_size=1 / 360),
                s=np.array([1]),
                v=np.linspace(0.9, 1, 2),
            )
            .permute('svh')
            .rgb_grid.reshape(-1, 3),
            dtype=torch.float32,
        ),
        epochs=3000,
        norm_loss_weight=0.05,
        nonplanar_loss_weight=0.02,
    ),
    CurriculumPhase(
        name='Brightness levels extended',
        data=torch.tensor(
            ColorCube.from_hsv(
                h=arange_cyclic(step_size=1 / 360),
                s=np.array([1]),
                v=np.linspace(0.8, 1, 3),
            )
            .permute('svh')
            .rgb_grid.reshape(-1, 3),
            dtype=torch.float32,
        ),
        epochs=2000,
        norm_loss_weight=0.1,
        nonplanar_loss_weight=0.005,
    ),
    CurriculumPhase(
        name='All brightness levels',
        data=torch.tensor(
            ColorCube.from_hsv(
                h=arange_cyclic(step_size=1 / 360),
                s=np.array([1]),
                v=np.linspace(0.2, 1, 10),
            )
            .permute('svh')
            .rgb_grid.reshape(-1, 3),
            dtype=torch.float32,
        ),
        epochs=3000,
        norm_loss_weight=0.1,
        nonplanar_loss_weight=0.0,
    ),
]

# Train with regularization
history = train_color_model(model, curriculum, base_lr=0.01, warmup_fraction=0.1)

Visualizing the bottleneck evolution

Training complete! Let's visualize the bottleneck state at the end of each curriculum phase. We expect to see the color wheel form in the first two dimensions during the early phases due to the planar constraint. As the constraint weakens and darker colors are added (later phases), we should see points appearing "inside" the main wheel in this 2D view. These inner points correspond to the darker colors, hopefully pushed along the 3rd dimension (which isn't shown in these plots).

import math
import matplotlib.pyplot as plt


# Create a scatter plot with points colored by their RGB values
nrows = math.ceil(len(curriculum) / 2)
fig, axes = plt.subplots(nrows, 2, figsize=(10, 5 * nrows))
axes = axes.flatten()

for phase, latents, ax in zip(curriculum, history, axes, strict=False):
    latents = latents.cpu().numpy()
    colors = phase.data.cpu().numpy().reshape(-1, 3)
    print(latents.shape, colors.shape)
    ax.scatter(latents[:, 0], latents[:, 1], c=colors, s=10, alpha=0.8)
    ax.set_title(phase.name)
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')

# Hide any unused subplots
for ax in axes[len(curriculum) :]:
    ax.axis('off')

fig.suptitle('Color Space Embeddings')

plt.tight_layout()
plt.show()
(12, 3) (12, 3)
(360, 3) (360, 3)
(720, 3) (720, 3)
(1080, 3) (1080, 3)
(3600, 3) (3600, 3)
[Figure: "Color Space Embeddings" — one 2D scatter plot per curriculum phase, showing bottleneck dimensions 1 and 2 with points colored by their RGB values.]

The plot seems to confirm our hypothesis: The model learned the color wheel primarily in the plane of the first two dimensions, and as darker colors (lower value) were introduced, it represented them by moving points inwards and along the 3rd dimension. The final structure resembles a cone or dome, with the brightest colors forming the wide rim and darker colors converging towards the apex.
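
Since the 2D plots above hide the third dimension, a 3D view of the final-phase latents makes the cone/dome shape easier to see. A minimal sketch, reusing history and curriculum from above:

# Illustrative: 3D scatter of the final-phase bottleneck, colored by RGB
final_latents = history[-1].cpu().numpy()
final_rgb = curriculum[-1].data.cpu().numpy()

fig = plt.figure(figsize=(6, 6))
ax3d = fig.add_subplot(projection='3d')
ax3d.scatter(final_latents[:, 0], final_latents[:, 1], final_latents[:, 2], c=final_rgb, s=10, alpha=0.8)
ax3d.set_xlabel('Dimension 1')
ax3d.set_ylabel('Dimension 2')
ax3d.set_zlabel('Dimension 3')
ax3d.set_title('Final bottleneck (3D view)')
plt.show()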

The curriculum and regularization successfully guided the model to learn a structured 3D representation that separates hue and value into different geometric features (angle vs. radius/height).
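
We can also probe that separation numerically. A rough check (again reusing history and curriculum, with matplotlib's rgb_to_hsv to recover each training color's HSV value): if value is encoded in the radius and/or height of the latent rather than the angle, the correlations below should be clearly non-zero.

from matplotlib.colors import rgb_to_hsv

# Rough quantitative check of the hue-vs-value geometry
final_latents = history[-1].cpu().numpy()
final_rgb = curriculum[-1].data.cpu().numpy()
value = rgb_to_hsv(final_rgb)[:, 2]  # HSV value of each training color

radius = np.linalg.norm(final_latents[:, :2], axis=1)  # distance from the 3rd axis
height = final_latents[:, 2]  # the out-of-plane dimension

print('corr(value, radius):', np.corrcoef(value, radius)[0, 1])
print('corr(value, height):', np.corrcoef(value, height)[0, 1])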

Next, we'll explore whether we can pin down those initial hue embeddings. It would be nice to be able to train on the tiny set of primary, secondary, and tertiary colors, and then know in advance where they will sit at the end of the training curriculum. Currently, they're allowed to drift, and the final shape of the latent space is very sensitive to the curriculum's hyperparameters (e.g. the regularization weights).