深度学习第一次作业:np_mnist实验报告——深度学习初探有何独到之处?

摘要:任务描述:用 numpy 实现训练 MLP 网络识别手写数字 MNIST。最终版本:网络结构 \(\text{[784, 512, 256, 10]}\),损失函数设定为交叉熵函数 cross_entropy,激活函数设定为 ReLU,输出层激活函数采用 softmax,学习率 0.01,最终在测试集上准确率为 \(95.51\%\)。
# -*- coding: utf-8 -*-
"""Train a NumPy-only MLP to classify MNIST handwritten digits (np_mnist.py).

@ author: Yuanze Lei

Assignment: change the loss function, network structure and activation
function, and train an MLP that recognises the MNIST handwritten digits.

Report summary
--------------
* Final configuration: layer sizes [784, 512, 256, 10], cross-entropy loss,
  ReLU hidden activation, softmax output layer, learning rate 0.01.
  The report states a final test-set accuracy of 95.51%.
* Three hidden activations are implemented (ReLU, tanh, sigmoid), each with
  its own weight-initialisation scale in ``init_weights``; the script prints
  the loss and accuracy for all three cases. ReLU performed best (see the
  figures in the report); the other two are clearly worse than ReLU.
* Other layer layouts were also tried -- [784, 256, 10],
  [784, 256, 128, 10] and [784, 512, 128, 10] -- and
  [784, 512, 256, 10] gave the best result.
"""

import numpy as np
from tqdm import tqdm

# Load the dataset (NumPy format). Labels are turned into one-hot rows by
# indexing a 10x10 identity matrix with the integer class ids.
X_train = np.load('./mnist/X_train.npy')  # (60000, 784), values in 0.0~1.0
y_train = np.load('./mnist/y_train.npy')  # (60000,)
y_train = np.eye(10)[y_train]             # (60000, 10), one-hot encoded
X_val = np.load('./mnist/X_val.npy')      # (10000, 784), values in 0.0~1.0
y_val = np.load('./mnist/y_val.npy')      # (10000,)
y_val = np.eye(10)[y_val]                 # (10000, 10), one-hot encoded
X_test = np.load('./mnist/X_test.npy')    # (10000, 784), values in 0.0~1.0
y_test = np.load('./mnist/y_test.npy')    # (10000,)
y_test = np.eye(10)[y_test]               # (10000, 10), one-hot encoded


# ---------------------------------------------------------------------------
# Hidden-layer activation functions and their element-wise derivatives
# ---------------------------------------------------------------------------
def ReLU(x):
    """Return ``max(0, x)`` element-wise."""
    return np.maximum(0, x)


def ReLU_prime(x):
    """Return the ReLU derivative: 1.0 where ``x > 0``, else 0.0."""
    return (x > 0).astype(float)


def sigmoid(x):
    """Return the logistic sigmoid of ``x``.

    The input is clipped to [-500, 500] so ``np.exp`` cannot overflow.
    """
    return 1 / (1 + np.exp(-np.clip(x, -500, 500)))


def sigmoid_prime(x):
    """Return the sigmoid derivative ``s * (1 - s)`` evaluated at ``x``."""
    s = sigmoid(x)
    return s * (1 - s)


def tanh(x):
    """Return the hyperbolic tangent of ``x``."""
    return np.tanh(x)


def tanh_prime(x):
    """Return the tanh derivative ``1 - tanh(x) ** 2``."""
    return 1 - np.tanh(x) ** 2


# ---------------------------------------------------------------------------
# Output-layer activation
# ---------------------------------------------------------------------------
def softmax(x):
    """Return row-wise softmax probabilities for a 2-D array of logits.

    The row maximum is subtracted before exponentiating for numerical
    stability; a small epsilon (1e-8) is added to the denominator.
    """
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / (np.sum(exp_x, axis=1, keepdims=True) + 1e-8)


def softmax_prime(x):
    """Return ``s * (1 - s)`` element-wise for ``s = softmax(x)``.

    NOTE: this is only the diagonal of the softmax Jacobian. ``Network``
    does not call it; it uses ``cross_entropy_loss_prime`` instead.
    """
    s = softmax(x)
    return s * (1 - s)


# ---------------------------------------------------------------------------
# Loss function
# ---------------------------------------------------------------------------
def cross_entropy_loss(y_true, y_pred):
    """Return the mean cross-entropy between one-hot targets and predictions.

    Predictions are clipped to [1e-8, 1 - 1e-8] so ``np.log`` never sees 0.
    """
    y_pred = np.clip(y_pred, 1e-8, 1-1e-8)
    return -np.mean(np.sum(y_true * np.log(y_pred), axis=1))


def cross_entropy_loss_prime(y_true, y_pred):
    """Return ``y_pred - y_true``.

    With a softmax output layer this is the gradient of the cross-entropy
    loss with respect to the pre-softmax logits, which is how
    ``Network.step`` uses it.
    """
    return y_pred - y_true


# ---------------------------------------------------------------------------
# Weight initialisation
# ---------------------------------------------------------------------------
def init_weights(shape=(), activation='ReLU'):
    """Draw a weight matrix from a zero-mean normal distribution.

    Args:
        shape: ``(fan_in, fan_out)`` of the layer.
        activation: name of the hidden activation; selects the variance:
            ``'ReLU'`` -> ``2 / fan_in``, ``'tanh'`` -> ``1 / fan_in``,
            anything else (e.g. ``'sigmoid'``) -> ``2 / (fan_in + fan_out)``.

    Returns:
        ``np.ndarray`` of the requested shape.
    """
    if activation == 'ReLU':
        z = 2.0 / shape[0]
    elif activation == 'tanh':
        z = 1.0 / shape[0]
    else:
        z = 2.0 / (shape[0] + shape[1])
    return np.random.normal(loc=0.0, scale=np.sqrt(z), size=shape)


# Maps an activation name to (function, derivative).
_ACTIVATIONS = {
    'ReLU': (ReLU, ReLU_prime),
    'sigmoid': (sigmoid, sigmoid_prime),
    'tanh': (tanh, tanh_prime),
}


# ---------------------------------------------------------------------------
# Network definition
# ---------------------------------------------------------------------------
class Network(object):
    '''
    MNIST classification network: a fully connected MLP with a configurable
    hidden activation and a softmax output layer, trained with plain
    mini-batch gradient descent.
    '''

    def __init__(self, layer_dims, lr=0.01, activation='ReLU'):
        """Build the layers.

        Args:
            layer_dims: layer widths from input to output,
                e.g. ``[784, 512, 256, 10]``.
            lr: learning rate used by ``step``.
            activation: ``'ReLU'``, ``'sigmoid'`` or ``'tanh'``.

        Raises:
            ValueError: if ``activation`` is not one of the supported names
                (previously this surfaced later as an ``AttributeError``
                inside ``forward``).
        """
        if activation not in _ACTIVATIONS:
            raise ValueError(
                f"unsupported activation {activation!r}; "
                f"expected one of {sorted(_ACTIVATIONS)}"
            )
        self.lr = lr
        self.act, self.act_prime = _ACTIVATIONS[activation]

        self.W = []
        self.b = []
        for fan_in, fan_out in zip(layer_dims[:-1], layer_dims[1:]):
            self.W.append(init_weights((fan_in, fan_out), activation))
            self.b.append(np.zeros(fan_out))

    def forward(self, x):
        """Run a forward pass and return the softmax output.

        Caches the pre-activations in ``self.z`` and the layer outputs in
        ``self.a`` (``self.a[0]`` is the input) for use by ``step``.
        """
        self.z = []
        self.a = [x]
        # Hidden layers use the configured activation ...
        for W, b in zip(self.W[:-1], self.b[:-1]):
            z = self.a[-1] @ W + b
            self.z.append(z)
            self.a.append(self.act(z))
        # ... and the output layer uses softmax.
        z = self.a[-1] @ self.W[-1] + self.b[-1]
        self.z.append(z)
        self.a.append(softmax(z))
        return self.a[-1]

    def step(self, x_batch, y_batch):
        '''
        One training step: forward pass, loss/accuracy on the batch,
        back-propagation and an in-place gradient-descent update.

        Returns ``(loss, acc)`` measured before the update.
        '''
        batch_size = x_batch.shape[0]

        # Forward pass
        probs = self.forward(x_batch)

        # Loss and accuracy
        loss = cross_entropy_loss(y_batch, probs)
        pred = np.argmax(probs, axis=1)
        true = np.argmax(y_batch, axis=1)
        acc = np.mean(pred == true)

        # Backward pass
        delta = cross_entropy_loss_prime(y_batch, probs)
        for i in range(len(self.W) - 1, -1, -1):
            dW = self.a[i].T @ delta / batch_size
            db = np.mean(delta, axis=0)
            if i > 0:
                # Propagate the error through the weights that produced the
                # forward pass, i.e. BEFORE this layer is updated; using the
                # already-updated W[i] would give a wrong gradient for every
                # earlier layer.
                delta = (delta @ self.W[i].T) * self.act_prime(self.z[i - 1])
            self.W[i] -= self.lr * dW
            self.b[i] -= self.lr * db

        return loss, acc

    def evaluate(self, X, Y):
        """Return the classification accuracy on ``X`` with one-hot ``Y``."""
        pred = np.argmax(self.forward(X), axis=1)
        true = np.argmax(Y, axis=1)
        return np.mean(pred == true)


def main():
    """Train one network per activation and print per-epoch metrics."""
    for activation in ('ReLU', 'sigmoid', 'tanh'):
        print(f"=== 网络结构: [784, 512, 256, 10] + {activation} + cross_entropy ===")
        net = Network([784, 512, 256, 10], lr=0.01, activation=activation)
        for epoch in range(10):
            # Reshuffle the training set every epoch.
            indices = np.random.permutation(len(X_train))
            p_bar = tqdm(range(0, len(X_train), 64), disable=True)
            losses = []
            accuracies = []
            for start in p_bar:
                idx = indices[start: start + 64]
                loss, acc = net.step(X_train[idx], y_train[idx])
                losses.append(loss)
                accuracies.append(acc)
                p_bar.set_postfix({'loss': f'{loss:.4f}', 'accuracy': f'{acc:.4f}'})
            val_acc = net.evaluate(X_val, y_val)
            print(f'epoch {epoch+1}: train_loss={np.mean(losses):.4f}, train_accuracy={np.mean(accuracies):.4f}, val_accuracy={val_acc:.4f}')
        test_acc = net.evaluate(X_test, y_test)
        print(f'\ntest_accuracy: {test_acc:.4f}\n')


if __name__ == '__main__':
    # Train the networks
    main()