Code practice for deep learning

September 1, 2025

by Leonardo

1. Code practice for deep learning

1.1. Adadelta Optimizer


1
import numpy as np
2
3
def adadelta_optimizer(parameter, grad, u, v, rho=0.95, epsilon=1e-6):
    """
    Perform a single AdaDelta update step.

    AdaDelta is an extension of AdaGrad that seeks to reduce its aggressive,
    monotonically decreasing learning rate by scaling each step with the
    ratio of two running RMS values instead of a global learning rate.

    Args:
        parameter: Current parameter value (scalar or NumPy array).
        grad: Current gradient, broadcast-compatible with ``parameter``.
        u: Running average of squared gradients (must be non-negative).
        v: Running average of squared parameter updates (must be
            non-negative).
        rho: Decay rate for both moving averages, in ``[0, 1)``
            (default=0.95).
        epsilon: Small positive constant for numerical stability
            (default=1e-6).

    Returns:
        tuple: ``(updated_parameter, updated_u, updated_v)``, each rounded
        to 5 decimal places.

    Raises:
        AssertionError: If ``rho`` or ``epsilon`` is out of range, or if any
            element of ``u`` or ``v`` is negative.
    """
    # Validation is kept as assertions so existing callers that expect
    # AssertionError keep working.
    assert 0 <= rho < 1, "Rho must be between 0 and 1"
    assert epsilon > 0, "Epsilon must be positive"
    # np.all works for scalars and arrays of any rank; the builtin all()
    # raises on arrays with more than one dimension and on 0-d arrays.
    assert np.all(np.asarray(u) >= 0), "u must be non-negative"
    assert np.all(np.asarray(v) >= 0), "v must be non-negative"

    # Update running average of squared gradients.
    u = rho * u + (1 - rho) * grad**2

    # RMS of the gradient uses the freshly updated average ...
    rms_grad = np.sqrt(u + epsilon)
    # ... while RMS of the updates uses the average from the previous step.
    rms_update = np.sqrt(v + epsilon)

    # Parameter update: gradient rescaled by the ratio of the two RMS values.
    dx = -rms_update / rms_grad * grad

    # Update running average of squared parameter updates.
    v = rho * v + (1 - rho) * dx**2

    parameter = parameter + dx

    return np.round(parameter, 5), np.round(u, 5), np.round(v, 5)

1.2. Position-wise Feed-Forward Block with Residual and Dropout

NumPy version:


1
import numpy as np
2
3
def ffn(x: list[float], W1: list[list[float]], b1: list[float], W2: list[list[float]], b2: list[float], dropout_p: float=0.1, seed: int=42) -> list[float]:
    """
    Position-wise feed-forward block with inverted dropout and a residual.

    Computes ``x + dropout(W2 @ relu(W1 @ x + b1) + b2)`` and rounds every
    element of the result to 4 decimal places.

    Args:
        x: Input vector.
        W1: Weight matrix of the first linear layer (applied as ``W1 @ x``).
        b1: Bias of the first linear layer.
        W2: Weight matrix of the second linear layer; its output must be
            addable to ``x`` for the residual connection.
        b2: Bias of the second linear layer.
        dropout_p: Probability of zeroing an output element, in ``[0, 1]``.
        seed: Seed of the dropout mask generator.

    Returns:
        The block output as a list of floats rounded to 4 decimals.

    Raises:
        ValueError: If ``dropout_p`` is outside ``[0, 1]``.
    """
    if not 0.0 <= dropout_p <= 1.0:
        raise ValueError("dropout_p must be in [0, 1]")

    # A private legacy generator yields the same stream as
    # np.random.seed(seed) followed by np.random.rand(...), but leaves the
    # process-wide NumPy RNG state untouched.
    rng = np.random.RandomState(seed)

    x_vec = np.array(x)
    W1, b1, W2, b2 = np.array(W1), np.array(b1), np.array(W2), np.array(b2)

    # First linear + ReLU
    hidden = np.maximum(0, W1.dot(x_vec) + b1)
    # Second linear
    out = W2.dot(hidden) + b2

    if dropout_p >= 1.0:
        # Everything is dropped; skip the 1 / (1 - p) rescale, which would
        # otherwise evaluate 0 / 0 and produce NaN.
        out = np.zeros(out.shape)
    else:
        # Inverted dropout: rescale the kept elements by 1 / (1 - p).
        mask = (rng.rand(*out.shape) > dropout_p).astype(float)
        out = out * mask / (1 - dropout_p)

    # Residual
    out = out + x_vec
    return [round(v, 4) for v in out.tolist()]

PyTorch version:


1
import torch
2
import torch.nn as nn
3
import torch.nn.functional as F
4
5
class FFNBlock(nn.Module):
    """Feed-forward block: Linear -> ReLU -> Linear -> Dropout -> residual add.

    The forward result is rounded to 4 decimal places before it is returned.
    """

    def __init__(self, d_model, d_hidden, dropout_p=0.1):
        """Create the two linear layers and the dropout layer.

        Args:
            d_model: Input/output feature size of the block.
            d_hidden: Feature size between the two linear layers.
            dropout_p: Probability passed to ``nn.Dropout``.
        """
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_hidden)
        self.linear2 = nn.Linear(d_hidden, d_model)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        """Apply the block to ``x`` and return the rounded result."""
        activated = F.relu(self.linear1(x))
        projected = self.dropout(self.linear2(activated))
        # Skip connection: add the untouched input back onto the output.
        summed = projected + x
        # Round to 4 decimals by scaling, rounding and scaling back.
        return torch.round(summed * 10000) / 10000

1.3. Leaky ReLU Activation Function


1
def leaky_relu(z: float, alpha: float = 0.01) -> float|int:
2
    return z if z > 0 else alpha * z

1.4. F-Score for Binary Classification


1
import numpy as np
2
3
def f_score(y_true, y_pred, beta):
    """
    Compute the F-beta score for binary classification.

    Args:
        y_true: True binary labels (0/1) as an array or any sequence.
        y_pred: Predicted binary labels (0/1), same length as ``y_true``.
        beta: Weight of recall relative to precision.

    Returns:
        The F-beta score rounded to 3 decimals, or ``0.0`` when precision or
        recall is zero (including when there are no positives at all).
    """
    # Coerce to arrays so the comparisons below are element-wise; with plain
    # lists `y_true == 1` would be a single False and every count would be 0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    tp = np.sum((y_true == 1) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))

    # Guard the ratios against an empty denominator.
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0

    op = precision * recall
    div = ((beta**2) * precision) + recall

    if div == 0 or op == 0:
        return 0.0

    score = (1 + (beta ** 2)) * op / div
    return round(score, 3)

1.5. Train Logistic Regression with Gradient Descent


1
import numpy as np
2
3
def train_logreg(
4
    X: np.ndarray,
5
    y: np.ndarray,
6
    learning_rate: float,
7
    iterations: int,
8
    *,
9
    l2: float = 0.0,          # L2 正则强度（不惩罚 bias）
10
    batch_size: int | None = None,  # None=全量GD；否则mini-batch
11
    tol: float = 1e-6,        # 早停阈值（相对改进）
12
    patience: int = 5,        # 早停耐心
13
    standardize: bool = True, # 是否标准化特征
14
    random_state: int = 42
15
) -> tuple[list[float], list[float]]:
16
    """
17
    工业实践版 Logistic Regression（梯度下降）:
18
    - 稳定的 BCE(logits) 损失: logaddexp(0,z) - y*z
19
    - 平均梯度；损失也按样本平均
20
    - L2 正则（不作用于 bias）
21
    - 可选 mini-batch；每轮洗牌
22
    - 简单早停(相对改进 < tol 持续 patience 轮)
23
    返回: (coeffs=[b,w1,...], losses=[per-iter mean loss])
24
    """
25
    rng = np.random.default_rng(random_state)
26
    m, n = X.shape
27
    # 标准化特征（常见且必要），bias 单独加到 Xb
28
    if standardize:
29
        mu = X.mean(0, keepdims=True)
30
        sigma = X.std(0, keepdims=True) + 1e-12
31
        Xn = (X - mu) / sigma
32
    else:
33
        Xn = X.copy()
34
35
    Xb = np.hstack([np.ones((m, 1)), Xn])  # [bias, features]
36
    theta = np.zeros(n + 1)
37
    losses, best_loss = [], np.inf
38
    bad = 0  # 早停计数
39
40
    if batch_size is None or batch_size <= 0 or batch_size > m:
41
        batch_size = m  # 全量 GD
42
43
    for it in range(iterations):
44
        # 每轮洗牌
45
        idx = rng.permutation(m)
46
        Xb_shuf, y_shuf = Xb[idx], y[idx]
47
48
        epoch_loss = 0.0
49
        for start in range(0, m, batch_size):
50
            xb = Xb_shuf[start:start+batch_size]
51
            yb = y_shuf[start:start+batch_size]
52
            z = xb @ theta                          # logits
53
            # mean BCE with logits (数值稳定)
54
            batch_loss = np.mean(np.logaddexp(0.0, z) - yb * z)
55
56
            # L2 正则（不惩罚 bias）
57
            w = theta[1:]
58
            reg = 0.5 * l2 * (w @ w)
59
            loss = batch_loss + reg
60
61
            # sigmoid & 平均梯度
62
            p = 1.0 / (1.0 + np.exp(-z))
63
            grad = (xb.T @ (p - yb)) / xb.shape[0]
64
            # L2 梯度（不作用于 bias）
65
            grad[1:] += l2 * w
66
67
            theta -= learning_rate * grad
68
            epoch_loss += loss * (xb.shape[0] / m)  # 汇总为 epoch 均值
69
70
        losses.append(float(np.round(epoch_loss, 4)))
71
72
        # 简单早停
73
        rel_impr = (best_loss - epoch_loss) / max(best_loss, 1.0)
74
        if epoch_loss + 1e-12 < best_loss:
75
            best_loss = epoch_loss
76
            bad = 0
77
        else:
78
            bad += 1
79
            if rel_impr < tol and bad >= patience:
80
                break
81
82
    # 若标准化过，需要把系数映射回原始特征尺度，方便下游使用
83
    # 原模型: z = b + (x-μ)/σ · w  = (b - μ/σ·w) + x · (w/σ)
84
    b = theta[0]
85
    w = theta[1:]
86
    if standardize:
87
        theta_out = np.empty_like(theta)
88
        theta_out[1:] = w / (X.std(0) + 1e-12)
89
        theta_out[0] = b - (X.mean(0) * theta_out[1:]).sum()
90
    else:
91
        theta_out = theta
92
93
    coeffs = [float(np.round(v, 4)) for v in theta_out.tolist()]
94
    return coeffs, losses
95

🔒 Access Restricted

Access Control

Code practice for deep learning

1. Code practice for deep learning

1.1. Adadelta Optimizer

1.2. Position-wise Feed-Forward Block with Residual and Dropout

1.3. Leaky ReLU Activation Function

1.4. F-Score for Binary Classification

1.5. Train Logistic Regression with Gradient Descent