# LayerNorm (Layer Normalization)

对每个样本的最后一个维度(通常是 hidden_size)进行归一化,计算公式如下:

对于输入 $x \in \mathbb{R}^H$($H$ 是 hidden_size):

1. 计算均值和方差

$$
\mu = \frac{1}{H} \sum_{i=1}^H x_i
$$

$$
\sigma^2 = \frac{1}{H} \sum_{i=1}^H (x_i - \mu)^2
$$

2. 归一化

$$
\hat{x}_i = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}}
$$

3. 缩放和平移

$$
y_i = \gamma_i \cdot \hat{x}_i + \beta_i
$$

其中:

  • $\epsilon$ 是防止除零的小常数
  • $\gamma$ 和 $\beta$ 是可学习的参数(与 hidden_size 同维度)
import torch
import torch.nn as nn
import numpy as np

class LayerNorm:
    """Manual (NumPy) implementation of Layer Normalization.

    Normalizes each sample over the last dimension (hidden_size):

        y = gamma * (x - mean) / sqrt(var + eps) + beta

    Caches the centered input and inverse standard deviation during the
    forward pass so the backward pass does not re-derive them.
    """

    def __init__(self, hidden_size, eps=1e-5):
        """
        Args:
            hidden_size: size of the normalized (last) dimension
            eps: small constant added to the variance to avoid division by zero
        """
        self.hidden_size = hidden_size
        self.eps = eps

        # Learnable parameters (same length as the hidden dimension)
        self.gamma = np.ones(hidden_size, dtype=np.float32)
        self.beta = np.zeros(hidden_size, dtype=np.float32)

        # Parameter gradients, populated by backward()
        self.gamma_grad = np.zeros_like(self.gamma)
        self.beta_grad = np.zeros_like(self.beta)

    def forward(self, x):
        """
        Forward pass.

        Args:
            x: input array, shape (batch_size, seq_len, hidden_size)
        Returns:
            Normalized array with the same shape as ``x``.
        """
        self.x = x
        batch_size, seq_len, hidden_size = x.shape

        # Flatten batch and sequence dims: each row is one sample to normalize.
        x2d = x.reshape(-1, hidden_size)

        # Per-row statistics over the hidden dimension. Computing the
        # variance from the already-centered values avoids np.var's second
        # pass over the mean and gives an identical result.
        self.mean = np.mean(x2d, axis=-1, keepdims=True)              # (B*S, 1)
        self.x_centered = x2d - self.mean                             # cached for backward
        self.var = np.mean(self.x_centered ** 2, axis=-1, keepdims=True)
        self.inv_std = 1.0 / np.sqrt(self.var + self.eps)             # cached for backward

        # Normalize, then scale and shift.
        self.x_norm = self.x_centered * self.inv_std                  # (B*S, H)
        output = self.x_norm * self.gamma + self.beta

        # Restore the original shape.
        return output.reshape(batch_size, seq_len, hidden_size)

    def backward(self, grad_output):
        """
        Backward pass. Stores the parameter gradients in ``gamma_grad`` and
        ``beta_grad`` and returns the gradient with respect to the input.

        Args:
            grad_output: upstream gradient, shape (batch_size, seq_len, hidden_size)
        Returns:
            Gradient w.r.t. the input, same shape as ``grad_output``.
        """
        batch_size, seq_len, hidden_size = grad_output.shape
        dout = grad_output.reshape(-1, hidden_size)                   # (B*S, H)

        # Parameter gradients, summed over every normalized row.
        self.beta_grad = np.sum(dout, axis=0)
        self.gamma_grad = np.sum(dout * self.x_norm, axis=0)

        N = hidden_size
        x_centered = self.x_centered  # cached by forward()
        inv_std = self.inv_std        # (var + eps) ** -0.5, cached by forward()

        # Gradient w.r.t. the normalized values.
        dx_norm = dout * self.gamma

        # Gradient w.r.t. the variance: inv_std**3 == (var + eps) ** -1.5.
        dvar = np.sum(dx_norm * x_centered * (-0.5) * inv_std ** 3,
                      axis=-1, keepdims=True)

        # Gradient w.r.t. the mean. The second term is analytically zero
        # (the centered values sum to 0) but is kept for a faithful derivation.
        dmean = (np.sum(-dx_norm * inv_std, axis=-1, keepdims=True)
                 + dvar * np.sum(-2.0 * x_centered, axis=-1, keepdims=True) / N)

        # Gradient w.r.t. the input.
        dx = dx_norm * inv_std + dvar * 2.0 * x_centered / N + dmean / N

        return dx.reshape(batch_size, seq_len, hidden_size)

    def update(self, lr):
        """Plain SGD step: theta <- theta - lr * grad."""
        self.gamma -= lr * self.gamma_grad
        self.beta -= lr * self.beta_grad


# 使用 PyTorch 的实现
class LayerNormTorch(nn.Module):
    """LayerNorm backed by PyTorch's built-in ``nn.LayerNorm``."""

    def __init__(self, hidden_size, eps=1e-5):
        """Create a LayerNorm over the trailing ``hidden_size`` dimension.

        Args:
            hidden_size: size of the normalized (last) dimension
            eps: numerical-stability constant passed through to ``nn.LayerNorm``
        """
        super().__init__()
        # Delegate all computation (and parameters) to the built-in module.
        self.norm = nn.LayerNorm(hidden_size, eps=eps)

    def forward(self, x):
        """Apply layer normalization.

        Args:
            x: tensor of shape (batch_size, sequence_length, hidden_size)
        Returns:
            Tensor of the same shape, normalized over the last dimension.
        """
        return self.norm(x)


# Demo / sanity checks
if __name__ == "__main__":
    # Fixed seeds for reproducible runs.
    torch.manual_seed(42)
    np.random.seed(42)

    # Toy dimensions.
    batch_size, seq_len, hidden_size = 2, 3, 4

    # One shared input for both implementations.
    x_np = np.random.randn(batch_size, seq_len, hidden_size).astype(np.float32)
    x_torch = torch.tensor(x_np, requires_grad=True)

    # Manual (NumPy) implementation.
    print("=== 手动实现 ===")
    manual_ln = LayerNorm(hidden_size)
    manual_out = manual_ln.forward(x_np)
    print("输入 shape:", x_np.shape)
    print("输出 shape:", manual_out.shape)
    print("输出前两个元素:\n", manual_out[0, 0, :2])

    # PyTorch reference implementation.
    print("\n=== PyTorch 实现 ===")
    torch_ln = LayerNormTorch(hidden_size)
    torch_out = torch_ln(x_torch)
    print("输出 shape:", torch_out.shape)
    print("输出前两个元素:\n", torch_out[0, 0, :2])

    # Largest absolute element-wise difference between the two outputs.
    print("\n=== 结果比较 ===")
    diff = np.abs(manual_out - torch_out.detach().numpy()).max()
    print(f"最大差异: {diff:.6f}")

    # Spot-check the normalization property at one (batch, position) slot.
    print("\n=== 归一化验证 ===")
    sample_output = manual_out[0, 1, :]
    mean = np.mean(sample_output)
    std = np.std(sample_output)
    print(f"归一化后 - 均值: {mean:.6f}, 标准差: {std:.6f}")