Source code for bonni.model.mlp

from dataclasses import dataclass
from flax import linen as nn
import jax
from flax.linen import initializers

from bonni.model.utils import ActivationType, InitType, get_activation_fn, get_init_fn


class MLPLayer(nn.Module):
    out_channels: int
    activation_type: ActivationType
    norm_groups: int | None
    dropout_prob: float = 0.0
    bias_init: InitType = InitType.zeros
    skip_if_possible: bool = False

    def setup(self):
        bias_init_fn = get_init_fn(self.bias_init)
        self.fc1 = nn.Dense(
            features=self.out_channels,
            bias_init=bias_init_fn,
            kernel_init=initializers.he_normal(),
        )
        self.norm = None
        if self.norm_groups is not None:
            self.norm = nn.GroupNorm(
                num_groups=self.norm_groups,
                epsilon=1e-5,  # Default epsilon value
            )
        self.activation = get_activation_fn(self.activation_type)
        self.dropout = None
        if self.dropout_prob > 0:
            self.dropout = nn.Dropout(
                rate=self.dropout_prob,
            )

    def __call__(
        self,
        x: jax.Array,
        deterministic: bool = False,
    ) -> jax.Array:
        # Apply first linear transformation
        y = self.fc1(x)
        # group norm
        if self.norm is not None:
            y = self.norm(y)
        # Apply activation
        y = self.activation(y)
        if self.skip_if_possible and x.shape == y.shape:
            y = x + y
        # Dropout
        if self.dropout is not None:
            y = self.dropout(y, deterministic=deterministic)
        return y



[docs]
@dataclass(frozen=True, kw_only=True)
class MLPModelConfig:
    """
    Configuration object for a Multi-Layer Perceptron (MLP) model.

    This dataclass defines the structural and hyperparameter settings for an MLP,
    including layer dimensions, normalization, dropout, and activation strategies.
    It is frozen (immutable) and requires keyword arguments for initialization.

    Attributes:
        num_layer (int): The total number of linear layers in the MLP.
        out_channels (int): The dimensionality of the output features.
        hidden_channels (int | None): The dimensionality of the hidden layers.
            If None, this is typically inferred from the input or output channels
            depending on the implementation. Defaults to None.
        norm_groups (int | None): The number of groups to use for Group Normalization
            in the hidden layers. If None, normalization is skipped. Defaults to None.
        last_norm_groups (int | None): The number of groups for Group Normalization
            applied to the final layer. If None, no normalization is applied to the
            output. Defaults to None.
        dropout_prob (float): The dropout probability applied after hidden layers.
            Must be between 0.0 and 1.0. Defaults to 0.0.
        last_dropout_prob (float): The dropout probability applied after the final layer.
            Defaults to 0.0.
        activation_type (ActivationType): The activation function used after hidden layers.
            Defaults to ActivationType.gelu.
        different_last_activation (ActivationType | None): The activation function
            used after the final layer. If set to `ActivationType.identity`, the output
            is linear. If None, the model typically uses the same activation as
            `activation_type`. Defaults to ActivationType.identity.
        bias_init (InitType): The initialization strategy for the layer biases
            (e.g., zeros, uniform). Defaults to InitType.zeros.
        skip_if_possible (bool): If True, adds residual connections (skip connections)
            around layers where the input and output dimensions are identical.
            Defaults to True.
    """

    num_layer: int
    out_channels: int
    hidden_channels: int | None = None
    norm_groups: int | None = None
    last_norm_groups: int | None = None
    dropout_prob: float = 0.0
    last_dropout_prob: float = 0.0
    activation_type: ActivationType = ActivationType.gelu
    different_last_activation: ActivationType | None = (
        ActivationType.identity
    )  # if none, use same activation
    bias_init: InitType = InitType.zeros
    skip_if_possible: bool = True



class MLP(nn.Module):
    cfg: MLPModelConfig

    def setup(self):
        if self.cfg.num_layer > 1:
            assert self.cfg.hidden_channels is not None, "need hidden dim with >1 layer"

        layers = []
        for idx in range(self.cfg.num_layer):
            # select activation
            cur_activ = (
                self.cfg.activation_type
                if idx != self.cfg.num_layer - 1
                or self.cfg.different_last_activation is None
                else self.cfg.different_last_activation
            )
            # select out channels
            cur_out_channels = self.cfg.out_channels
            if idx != self.cfg.num_layer - 1:
                assert self.cfg.hidden_channels is not None
                cur_out_channels = self.cfg.hidden_channels
            # select dropout prob, norm_groups
            dropout_prob = (
                self.cfg.dropout_prob
                if idx != self.cfg.num_layer - 1
                else self.cfg.last_dropout_prob
            )
            norm_groups = (
                self.cfg.norm_groups
                if idx != self.cfg.num_layer - 1
                else self.cfg.last_norm_groups
            )
            cur_skip_if_possible = (
                self.cfg.skip_if_possible if idx != self.cfg.num_layer - 1 else False
            )
            # build current layer
            cur_layer = MLPLayer(
                out_channels=cur_out_channels,
                activation_type=cur_activ,
                norm_groups=norm_groups,
                dropout_prob=dropout_prob,
                bias_init=self.cfg.bias_init,
                skip_if_possible=cur_skip_if_possible,
            )
            layers.append(cur_layer)
        self.layers = layers

    def __call__(
        self,
        x: jax.Array,
        deterministic: bool = False,
    ) -> jax.Array:
        for layer in self.layers:
            x = layer(x, deterministic=deterministic)
        return x