Code practice for deep learning
1. Code practice for deep learning
1.1. Adadelta Optimizer
import numpy as np


def adadelta_optimizer(parameter, grad, u, v, rho=0.95, epsilon=1e-6):
    """Perform one AdaDelta update step.

    AdaDelta is an extension of AdaGrad that seeks to reduce its aggressive,
    monotonically decreasing learning rate by scaling each step with the ratio
    of two running RMS values instead of a global learning rate.

    Args:
        parameter: Current parameter value (scalar or NumPy array).
        grad: Current gradient, same shape as ``parameter``.
        u: Running average of squared gradients (must be non-negative).
        v: Running average of squared parameter updates (must be non-negative).
        rho: Decay rate for the moving averages, in ``[0, 1)`` (default 0.95).
        epsilon: Small positive constant for numerical stability (default 1e-6).

    Returns:
        tuple: ``(updated_parameter, updated_u, updated_v)``, each rounded to
        5 decimal places.

    Raises:
        AssertionError: If ``rho`` or ``epsilon`` is out of range, or if ``u``
            or ``v`` contains a negative entry.
    """
    assert 0 <= rho < 1, "Rho must be between 0 and 1"
    assert epsilon > 0, "Epsilon must be positive"
    # np.all reduces over every axis. The builtin all() iterates over rows and
    # raises "truth value of an array is ambiguous" for inputs with ndim >= 2.
    assert np.all(np.asarray(u) >= 0), "u must be non-negative"
    assert np.all(np.asarray(v) >= 0), "v must be non-negative"

    # Update running average of squared gradients.
    u = rho * u + (1 - rho) * grad**2

    # RMS of the gradient (uses the freshly updated u).
    rms_g = np.sqrt(u + epsilon)

    # RMS of previous parameter updates (uses v from before this step).
    rms_dx = np.sqrt(v + epsilon)

    # Parameter update: the ratio of the two RMS terms acts as the step size.
    dx = -rms_dx / rms_g * grad

    # Update running average of squared parameter updates.
    v = rho * v + (1 - rho) * dx**2

    parameter = parameter + dx

    return np.round(parameter, 5), np.round(u, 5), np.round(v, 5)
1.2. Position-wise Feed-Forward Block with Residual and Dropout
NumPy version:
import numpy as np


def ffn(
    x: list[float],
    W1: list[list[float]],
    b1: list[float],
    W2: list[list[float]],
    b2: list[float],
    dropout_p: float = 0.1,
    seed: int = 42,
) -> list[float]:
    """Apply a position-wise feed-forward block with dropout and a residual.

    Computes ``x + dropout(W2 @ relu(W1 @ x + b1) + b2)`` using inverted
    dropout, i.e. kept activations are scaled by ``1 / (1 - dropout_p)``.

    Args:
        x: Input vector.
        W1: Weight matrix of the first linear layer.
        b1: Bias of the first linear layer.
        W2: Weight matrix of the second linear layer; its output must have the
            same length as ``x`` for the residual addition.
        b2: Bias of the second linear layer.
        dropout_p: Probability of dropping an output element.
        seed: Seed for the dropout mask, making the result deterministic.

    Returns:
        The block output as a list of floats rounded to 4 decimal places.
    """
    # A private generator yields the same stream as np.random.seed(seed)
    # followed by np.random.rand(...), but leaves the caller's global RNG alone.
    rng = np.random.RandomState(seed)

    x = np.asarray(x, dtype=float)
    W1, b1, W2, b2 = np.array(W1), np.array(b1), np.array(W2), np.array(b2)

    # First linear layer + ReLU.
    hidden = np.maximum(0, W1.dot(x) + b1)
    # Second linear layer.
    out = W2.dot(hidden) + b2

    if 1 - dropout_p > 0:
        # Inverted dropout: zero dropped units, rescale the survivors.
        mask = (rng.rand(*out.shape) > dropout_p).astype(float)
        out = out * mask / (1 - dropout_p)
    else:
        # Everything is dropped; skip the 0/0 division that would produce NaN.
        out = np.zeros(out.shape)

    # Residual connection.
    out = out + x
    return [round(v, 4) for v in out.tolist()]
PyTorch version:
import torch
import torch.nn as nn
import torch.nn.functional as F


class FFNBlock(nn.Module):
    """Position-wise feed-forward block with dropout and a residual connection.

    The forward pass computes ``x + dropout(linear2(relu(linear1(x))))`` and
    rounds the result to 4 decimal places.

    Args:
        d_model: Size of the input and output feature dimension.
        d_hidden: Size of the hidden layer between the two linear maps.
        dropout_p: Dropout probability applied to the second linear output.
    """

    def __init__(self, d_model, d_hidden, dropout_p=0.1):
        super().__init__()
        # Expand to the hidden width, then project back to the model width.
        self.linear1 = nn.Linear(in_features=d_model, out_features=d_hidden)
        self.linear2 = nn.Linear(in_features=d_hidden, out_features=d_model)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, x):
        """Return the block output for ``x``, rounded to 4 decimals."""
        hidden = F.relu(self.linear1(x))
        projected = self.dropout(self.linear2(hidden))
        summed = projected + x  # residual connection
        # NOTE(review): rounding has zero gradient almost everywhere, so this
        # output is suited to inspection/comparison rather than training.
        return torch.round(summed * 10000) / 10000
1.3. Leaky ReLU Activation Function
1def leaky_relu(z: float, alpha: float = 0.01) -> float|int:2 return z if z > 0 else alpha * z
1.4. F-Score for Binary Classification
import numpy as np


def f_score(y_true, y_pred, beta):
    """Compute the F-beta score for binary classification.

    Args:
        y_true: True binary labels (0/1); a NumPy array or any array-like
            such as a plain list.
        y_pred: Predicted binary labels (0/1), same length as ``y_true``.
        beta: Weight of recall relative to precision (``beta=1`` gives F1).

    Returns:
        float: The F-beta score rounded to 3 decimal places, or ``0.0`` when
        precision, recall, or the denominator is zero.
    """
    # Coerce to arrays first: with plain lists `y_true == 1` is a single
    # Python bool, which silently makes every count zero.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    tp = np.sum((y_true == 1) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))

    # Guard the ratios against empty positive / predicted-positive sets.
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0

    op = precision * recall
    div = ((beta**2) * precision) + recall

    if div == 0 or op == 0:
        return 0.0

    score = (1 + (beta ** 2)) * op / div
    return float(round(score, 3))
1.5. Train Logistic Regression with Gradient Descent
1import numpy as np23def train_logreg(4 X: np.ndarray,5 y: np.ndarray,6 learning_rate: float,7 iterations: int,8 *,9 l2: float = 0.0, # L2 正则强度(不惩罚 bias)10 batch_size: int | None = None, # None=全量GD;否则mini-batch11 tol: float = 1e-6, # 早停阈值(相对改进)12 patience: int = 5, # 早停耐心13 standardize: bool = True, # 是否标准化特征14 random_state: int = 4215) -> tuple[list[float], list[float]]:16 """17 工业实践版 Logistic Regression(梯度下降):18 - 稳定的 BCE(logits) 损失: logaddexp(0,z) - y*z19 - 平均梯度;损失也按样本平均20 - L2 正则(不作用于 bias)21 - 可选 mini-batch;每轮洗牌22 - 简单早停(相对改进 < tol 持续 patience 轮)23 返回: (coeffs=[b,w1,...], losses=[per-iter mean loss])24 """25 rng = np.random.default_rng(random_state)26 m, n = X.shape27 # 标准化特征(常见且必要),bias 单独加到 Xb28 if standardize:29 mu = X.mean(0, keepdims=True)30 sigma = X.std(0, keepdims=True) + 1e-1231 Xn = (X - mu) / sigma32 else:33 Xn = X.copy()3435 Xb = np.hstack([np.ones((m, 1)), Xn]) # [bias, features]36 theta = np.zeros(n + 1)37 losses, best_loss = [], np.inf38 bad = 0 # 早停计数3940 if batch_size is None or batch_size <= 0 or batch_size > m:41 batch_size = m # 全量 GD4243 for it in range(iterations):44 # 每轮洗牌45 idx = rng.permutation(m)46 Xb_shuf, y_shuf = Xb[idx], y[idx]4748 epoch_loss = 0.049 for start in range(0, m, batch_size):50 xb = Xb_shuf[start:start+batch_size]51 yb = y_shuf[start:start+batch_size]52 z = xb @ theta # logits53 # mean BCE with logits (数值稳定)54 batch_loss = np.mean(np.logaddexp(0.0, z) - yb * z)5556 # L2 正则(不惩罚 bias)57 w = theta[1:]58 reg = 0.5 * l2 * (w @ w)59 loss = batch_loss + reg6061 # sigmoid & 平均梯度62 p = 1.0 / (1.0 + np.exp(-z))63 grad = (xb.T @ (p - yb)) / xb.shape[0]64 # L2 梯度(不作用于 bias)65 grad[1:] += l2 * w6667 theta -= learning_rate * grad68 epoch_loss += loss * (xb.shape[0] / m) # 汇总为 epoch 均值6970 losses.append(float(np.round(epoch_loss, 4)))7172 # 简单早停73 rel_impr = (best_loss - epoch_loss) / max(best_loss, 1.0)74 if epoch_loss + 1e-12 < best_loss:75 best_loss = epoch_loss76 bad = 077 else:78 bad += 179 if rel_impr < tol and bad >= 
patience:80 break8182 # 若标准化过,需要把系数映射回原始特征尺度,方便下游使用83 # 原模型: z = b + (x-μ)/σ · w = (b - μ/σ·w) + x · (w/σ)84 b = theta[0]85 w = theta[1:]86 if standardize:87 theta_out = np.empty_like(theta)88 theta_out[1:] = w / (X.std(0) + 1e-12)89 theta_out[0] = b - (X.mean(0) * theta_out[1:]).sum()90 else:91 theta_out = theta9293 coeffs = [float(np.round(v, 4)) for v in theta_out.tolist()]94 return coeffs, losses95