Code practice for deep learning

1. Code practice for deep learning

1.1. Adadelta Optimizer

1
import numpy as np
2
3
def adadelta_optimizer(parameter, grad, u, v, rho=0.95, epsilon=1e-6):
    """Perform one AdaDelta update step.

    AdaDelta is an extension of AdaGrad that seeks to reduce its aggressive,
    monotonically decreasing learning rate. The step size is the ratio of the
    RMS of past parameter updates to the RMS of past gradients, so no global
    learning rate is needed.

    Args:
        parameter: Current parameter value (scalar or NumPy array).
        grad: Current gradient, same shape as ``parameter``.
        u: Running average of squared gradients.
        v: Running average of squared parameter updates.
        rho: Decay rate for both moving averages (default 0.95).
        epsilon: Small constant added inside the square roots for numerical
            stability (default 1e-6).

    Returns:
        tuple: ``(updated_parameter, updated_u, updated_v)``, each rounded to
        5 decimal places.

    Raises:
        AssertionError: If ``rho`` is outside ``[0, 1)``, ``epsilon`` is not
            positive, or ``u`` / ``v`` contain a negative entry.
    """
    assert 0 <= rho < 1, "Rho must be between 0 and 1"
    assert epsilon > 0, "Epsilon must be positive"
    # np.all reduces over every axis. The builtin all() iterates only the
    # first axis, which raises on multi-dimensional and 0-d arrays.
    assert np.all(np.asarray(u) >= 0), "u must be non-negative"
    assert np.all(np.asarray(v) >= 0), "v must be non-negative"

    # Update running average of squared gradients.
    u = rho * u + (1 - rho) * grad**2

    # RMS of the gradient history and of the update history.
    rms_g = np.sqrt(u + epsilon)
    rms_dx = np.sqrt(v + epsilon)

    # The update uses the *previous* v; v is refreshed only afterwards.
    dx = -rms_dx / rms_g * grad

    # Update running average of squared parameter updates.
    v = rho * v + (1 - rho) * dx**2

    parameter = parameter + dx

    return np.round(parameter, 5), np.round(u, 5), np.round(v, 5)

1.2. Position-wise Feed-Forward Block with Residual and Dropout

NumPy version:

1
import numpy as np
2
3
def ffn(x: list[float], W1: list[list[float]], b1: list[float], W2: list[list[float]], b2: list[float], dropout_p: float=0.1, seed: int=42) -> list[float]:
    """Apply a position-wise feed-forward block with dropout and a residual.

    Computes ``x + dropout(W2 @ relu(W1 @ x + b1) + b2)`` for a single vector.
    Dropout is inverted dropout: kept activations are scaled by
    ``1 / (1 - dropout_p)``.

    Args:
        x: Input vector.
        W1: Weight matrix of the first linear layer, applied as ``W1 @ x``.
        b1: Bias of the first linear layer.
        W2: Weight matrix of the second linear layer; its output is added
            element-wise to ``x``.
        b2: Bias of the second linear layer.
        dropout_p: Probability of zeroing each output element, in ``[0, 1]``.
        seed: Seed passed to ``np.random.seed`` before the mask is drawn.
            Note that this reseeds NumPy's global generator.

    Returns:
        The block output as a list of floats rounded to 4 decimal places.

    Raises:
        ValueError: If ``dropout_p`` is outside ``[0, 1]``.
    """
    if not 0.0 <= dropout_p <= 1.0:
        raise ValueError(
            f"dropout probability has to be between 0 and 1, but got {dropout_p}"
        )

    np.random.seed(seed)
    x = np.array(x)
    W1, b1, W2, b2 = np.array(W1), np.array(b1), np.array(W2), np.array(b2)

    # First linear + ReLU
    hidden = np.maximum(0, W1.dot(x) + b1)
    # Second linear
    out = W2.dot(hidden) + b2

    # Dropout mask (always drawn so the global RNG stream is consumed
    # identically for every dropout_p).
    mask = (np.random.rand(*out.shape) > dropout_p).astype(float)
    if dropout_p < 1.0:
        out = out * mask / (1 - dropout_p)
    else:
        # Everything is dropped; dividing by (1 - p) == 0 would give NaN.
        out = out * 0.0

    # Residual
    out = out + x
    return [round(v, 4) for v in out.tolist()]

PyTorch version:

1
import torch
2
import torch.nn as nn
3
import torch.nn.functional as F
4
5
class FFNBlock(nn.Module):
    """Position-wise feed-forward block with dropout and a residual connection.

    The forward pass computes ``x + dropout(linear2(relu(linear1(x))))`` and
    rounds the result to 4 decimal places.
    """

    def __init__(self, d_model, d_hidden, dropout_p=0.1):
        """Create the two linear layers and the dropout layer.

        Args:
            d_model: Size of the input and output features.
            d_hidden: Size of the hidden layer between the two projections.
            dropout_p: Dropout probability applied to the second projection.
        """
        super().__init__()
        # Creation order is kept so seeded parameter initialisation matches.
        self.linear1 = nn.Linear(d_model, d_hidden)
        self.linear2 = nn.Linear(d_hidden, d_model)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        """Return the rounded block output for input tensor ``x``."""
        hidden = torch.relu(self.linear1(x))
        projected = self.dropout(self.linear2(hidden))
        summed = projected + x  # residual connection
        return (summed * 10000).round() / 10000

1.3. Leaky ReLU Activation Function

1
def leaky_relu(z: float, alpha: float = 0.01) -> float|int:
2
return z if z > 0 else alpha * z

1.4. F-Score for Binary Classification

1
import numpy as np
2
3
def f_score(y_true, y_pred, beta):
    """Compute the F-beta score for binary classification.

    The score is ``(1 + beta**2) * P * R / (beta**2 * P + R)``, where ``P`` is
    precision and ``R`` is recall for the positive class (label 1).

    Args:
        y_true: Ground-truth labels (0 or 1); any array-like.
        y_pred: Predicted labels (0 or 1); any array-like of the same shape.
        beta: Weight of recall relative to precision.

    Returns:
        The F-beta score rounded to 3 decimal places, or 0.0 when precision,
        recall, or the denominator is zero.
    """
    # Coerce to arrays: with plain lists ``y_true == 1`` is a scalar False,
    # which would make every count zero without raising.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    tp = np.sum((y_true == 1) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))

    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0

    op = precision * recall
    div = ((beta**2) * precision) + recall

    # No true positives (or a zero denominator) means the score is zero.
    if div == 0 or op == 0:
        return 0.0

    score = (1 + (beta ** 2)) * op / div
    return float(round(score, 3))

1.5. Train Logistic Regression with Gradient Descent

1
import numpy as np
2
3
def train_logreg(
4
X: np.ndarray,
5
y: np.ndarray,
6
learning_rate: float,
7
iterations: int,
8
*,
9
l2: float = 0.0, # L2 正则强度(不惩罚 bias)
10
batch_size: int | None = None, # None=全量GD;否则mini-batch
11
tol: float = 1e-6, # 早停阈值(相对改进)
12
patience: int = 5, # 早停耐心
13
standardize: bool = True, # 是否标准化特征
14
random_state: int = 42
15
) -> tuple[list[float], list[float]]:
16
"""
17
工业实践版 Logistic Regression(梯度下降):
18
- 稳定的 BCE(logits) 损失: logaddexp(0,z) - y*z
19
- 平均梯度;损失也按样本平均
20
- L2 正则(不作用于 bias)
21
- 可选 mini-batch;每轮洗牌
22
- 简单早停(相对改进 < tol 持续 patience 轮)
23
返回: (coeffs=[b,w1,...], losses=[per-iter mean loss])
24
"""
25
rng = np.random.default_rng(random_state)
26
m, n = X.shape
27
# 标准化特征(常见且必要),bias 单独加到 Xb
28
if standardize:
29
mu = X.mean(0, keepdims=True)
30
sigma = X.std(0, keepdims=True) + 1e-12
31
Xn = (X - mu) / sigma
32
else:
33
Xn = X.copy()
34
35
Xb = np.hstack([np.ones((m, 1)), Xn]) # [bias, features]
36
theta = np.zeros(n + 1)
37
losses, best_loss = [], np.inf
38
bad = 0 # 早停计数
39
40
if batch_size is None or batch_size <= 0 or batch_size > m:
41
batch_size = m # 全量 GD
42
43
for it in range(iterations):
44
# 每轮洗牌
45
idx = rng.permutation(m)
46
Xb_shuf, y_shuf = Xb[idx], y[idx]
47
48
epoch_loss = 0.0
49
for start in range(0, m, batch_size):
50
xb = Xb_shuf[start:start+batch_size]
51
yb = y_shuf[start:start+batch_size]
52
z = xb @ theta # logits
53
# mean BCE with logits (数值稳定)
54
batch_loss = np.mean(np.logaddexp(0.0, z) - yb * z)
55
56
# L2 正则(不惩罚 bias)
57
w = theta[1:]
58
reg = 0.5 * l2 * (w @ w)
59
loss = batch_loss + reg
60
61
# sigmoid & 平均梯度
62
p = 1.0 / (1.0 + np.exp(-z))
63
grad = (xb.T @ (p - yb)) / xb.shape[0]
64
# L2 梯度(不作用于 bias)
65
grad[1:] += l2 * w
66
67
theta -= learning_rate * grad
68
epoch_loss += loss * (xb.shape[0] / m) # 汇总为 epoch 均值
69
70
losses.append(float(np.round(epoch_loss, 4)))
71
72
# 简单早停
73
rel_impr = (best_loss - epoch_loss) / max(best_loss, 1.0)
74
if epoch_loss + 1e-12 < best_loss:
75
best_loss = epoch_loss
76
bad = 0
77
else:
78
bad += 1
79
if rel_impr < tol and bad >= patience:
80
break
81
82
# 若标准化过,需要把系数映射回原始特征尺度,方便下游使用
83
# 原模型: z = b + (x-μ)/σ · w = (b - μ/σ·w) + x · (w/σ)
84
b = theta[0]
85
w = theta[1:]
86
if standardize:
87
theta_out = np.empty_like(theta)
88
theta_out[1:] = w / (X.std(0) + 1e-12)
89
theta_out[0] = b - (X.mean(0) * theta_out[1:]).sum()
90
else:
91
theta_out = theta
92
93
coeffs = [float(np.round(v, 4)) for v in theta_out.tolist()]
94
return coeffs, losses
95