深度学习第一次作业:np_mnist 实验报告
摘要:任务描述:用 numpy 实现训练 MLP 网络识别手写数字 MNIST。最终版本:网络结构 \(\text{[784, 512, 256, 10]}\),损失函数设定为交叉熵函数 cross_entropy,激活函数设定为 ReLU,输出层激活函数采用 softmax。
任务描述
用 numpy 实现训练 MLP 网络识别手写数字 MNIST
最终版本:网络结构 \(\text{[784, 512, 256, 10]}\),损失函数设定为交叉熵函数 cross_entropy,激活函数设定为 ReLU,输出层激活函数采用 softmax,学习率 0.01,最终在测试集上准确率为 \(95.51\%\)
代码 np_mnist.py 实现了三种激活函数 ReLU,tanh,sigmoid,分别设定了不同的权重初始化函数 init_weights,输出三种情况的损失值,准确率
其中 ReLU 效果最好
如图所示
剩下两种情况如下,明显不如 ReLU
也尝试过不同的网络结构,如 \(\text{[784, 256, 10],[784, 256, 128, 10],[784, 512, 128, 10]}\)
最终 \(\text{[784, 512, 256, 10]}\) 效果最好
\(\text{np_mnist.py}\)
# -*- coding: utf-8 -*-
"""
@ author: Yuanze Lei
"""
# 作业内容:更改loss函数、网络结构、激活函数,完成训练MLP网络识别手写数字MNIST数据集
import numpy as np
from tqdm import tqdm
# Load the MNIST splits stored as numpy arrays; pixel values lie in [0.0, 1.0].
# Labels are converted to one-hot rows via fancy-indexing into an identity matrix.
X_train = np.load('./mnist/X_train.npy')              # (60000, 784)
y_train = np.eye(10)[np.load('./mnist/y_train.npy')]  # (60000, 10) one-hot
X_val = np.load('./mnist/X_val.npy')                  # (10000, 784)
y_val = np.eye(10)[np.load('./mnist/y_val.npy')]      # (10000, 10) one-hot
X_test = np.load('./mnist/X_test.npy')                # (10000, 784)
y_test = np.eye(10)[np.load('./mnist/y_test.npy')]    # (10000, 10) one-hot
# 定义激活函数
def ReLU(x):
    """Element-wise rectified linear unit: clamp negative entries to zero."""
    return np.clip(x, 0, None)
def ReLU_prime(x):
    """Derivative of ReLU: 1.0 where x > 0, else 0.0 (float array)."""
    return np.where(x > 0, 1.0, 0.0)
def sigmoid(x):
    """Numerically safe logistic function.

    Inputs are clipped to [-500, 500] so np.exp never overflows.
    """
    z = np.clip(x, -500, 500)
    return 1 / (1 + np.exp(-z))
def sigmoid_prime(x):
    """Derivative of the logistic function, computed as s(x) * (1 - s(x))."""
    s_val = sigmoid(x)
    return s_val * (1 - s_val)
def tanh(x):
    """Hyperbolic tangent activation (thin wrapper over numpy's ufunc)."""
    return np.tanh(x)
def tanh_prime(x):
    """Derivative of tanh: 1 - tanh(x)^2."""
    t = np.tanh(x)
    return 1 - t ** 2
#输出层激活函数
def softmax(x):
    """Row-wise softmax for a batch of logits, shape (batch, classes).

    Subtracting the per-row max before exponentiating prevents overflow.
    After that shift the denominator contains exp(0) = 1, so it is always
    >= 1 and the previous `+ 1e-8` epsilon guard was unnecessary — removing
    it makes each row sum to exactly 1.
    """
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)
def softmax_prime(x):
    """Element-wise derivative s * (1 - s) of the softmax output.

    NOTE(review): this is only the diagonal of the full softmax Jacobian.
    It is not used by the backprop path below, which folds softmax into the
    cross-entropy gradient (see cross_entropy_loss_prime) instead.
    """
    probs = softmax(x)
    return probs * (1 - probs)
# 定义损失函数
def cross_entropy_loss(y_true, y_pred):
    """Mean cross-entropy over a batch of one-hot targets.

    Predictions are clipped away from 0 and 1 so log() never receives 0.
    """
    safe_pred = np.clip(y_pred, 1e-8, 1 - 1e-8)
    per_sample = -np.sum(y_true * np.log(safe_pred), axis=1)
    return np.mean(per_sample)
def cross_entropy_loss_prime(y_true, y_pred):
    """Gradient used to start backprop: y_pred - y_true.

    NOTE(review): this is the combined softmax + cross-entropy gradient with
    respect to the pre-softmax logits (not dCE/dy_pred alone); Network.step
    relies on exactly this form when it seeds `delta`.
    """
    return np.subtract(y_pred, y_true)
# 定义权重初始化函数
def init_weights(shape=(), activation='ReLU'):
    """Draw a Gaussian weight matrix scaled for the given activation.

    'ReLU' uses He initialization (var = 2/fan_in), 'tanh' uses Xavier
    (var = 1/fan_in), and anything else (e.g. 'sigmoid') uses Glorot
    (var = 2/(fan_in + fan_out)). `shape` is (fan_in, fan_out).
    """
    fan_in = shape[0]
    if activation == 'ReLU':
        variance = 2.0 / fan_in
    elif activation == 'tanh':
        variance = 1.0 / fan_in
    else:
        variance = 2.0 / (fan_in + shape[1])
    return np.random.normal(loc=0.0, scale=np.sqrt(variance), size=shape)
# 定义网络结构
class Network(object):
    '''
    MLP classifier for the MNIST digits.

    layer_dims: list of layer sizes, e.g. [784, 512, 256, 10].
    lr:         SGD learning rate.
    activation: hidden-layer activation, one of 'ReLU', 'sigmoid', 'tanh'.
    '''
    def __init__(self, layer_dims, lr=0.01, activation='ReLU'):
        self.lr = lr
        if activation == 'ReLU':
            self.act, self.act_prime = ReLU, ReLU_prime
        elif activation == 'sigmoid':
            self.act, self.act_prime = sigmoid, sigmoid_prime
        elif activation == 'tanh':
            self.act, self.act_prime = tanh, tanh_prime
        else:
            # Fail fast: the original silently fell through, leaving
            # self.act undefined until forward() crashed with AttributeError.
            raise ValueError(f'unknown activation: {activation}')
        self.W = []  # weight matrices, W[i] has shape (dims[i], dims[i+1])
        self.b = []  # bias vectors, b[i] has shape (dims[i+1],)
        for i in range(len(layer_dims) - 1):
            self.W.append(init_weights((layer_dims[i], layer_dims[i + 1]), activation))
            self.b.append(np.zeros(layer_dims[i + 1]))

    def forward(self, x):
        '''Forward pass; caches pre-activations (z) and activations (a) for backprop.'''
        self.z = []
        self.a = [x]
        for i in range(len(self.W) - 1):
            z = self.a[-1] @ self.W[i] + self.b[i]
            self.z.append(z)
            self.a.append(self.act(z))
        # Output layer always uses softmax, independent of the hidden activation.
        z = self.a[-1] @ self.W[-1] + self.b[-1]
        self.z.append(z)
        self.a.append(softmax(z))
        return self.a[-1]

    def step(self, x_batch, y_batch):
        '''
        One SGD step on a mini-batch; returns (loss, accuracy) for that batch.
        '''
        batch_size = x_batch.shape[0]
        # Forward pass (fills self.z / self.a caches).
        self.forward(x_batch)
        # Batch loss and accuracy.
        loss = cross_entropy_loss(y_batch, self.a[-1])
        pred = np.argmax(self.a[-1], axis=1)
        true = np.argmax(y_batch, axis=1)
        acc = np.mean(pred == true)
        # Backward pass: delta holds the gradient w.r.t. layer i's logits
        # (softmax and cross-entropy are folded together).
        delta = cross_entropy_loss_prime(y_batch, self.a[-1])
        for i in range(len(self.W) - 1, -1, -1):
            dW = self.a[i].T @ delta / batch_size
            db = np.mean(delta, axis=0)
            # BUG FIX: propagate delta through the CURRENT weights before
            # updating them. The original updated W[i] first, so earlier
            # layers received gradients computed with already-modified weights.
            if i > 0:
                delta = (delta @ self.W[i].T) * self.act_prime(self.z[i - 1])
            self.W[i] -= self.lr * dW
            self.b[i] -= self.lr * db
        return loss, acc

    def evaluate(self, X, Y):
        '''Accuracy of argmax predictions against one-hot labels Y.'''
        pred = np.argmax(self.forward(X), axis=1)
        true = np.argmax(Y, axis=1)
        return np.mean(pred == true)
if __name__ == '__main__':
    # Train one network per hidden activation and compare test accuracy.
    # Iterating the names directly replaces the original index-based dispatch.
    for activation in ('ReLU', 'sigmoid', 'tanh'):
        print(f"=== 网络结构: [784, 512, 256, 10] + {activation} + cross_entropy ===")
        net = Network([784, 512, 256, 10], lr=0.01, activation=activation)
        for epoch in range(10):
            # Reshuffle the training set every epoch, then take mini-batches of 64.
            indices = np.random.permutation(len(X_train))
            p_bar = tqdm(range(0, len(X_train), 64), disable=True)
            losses = []
            accuracies = []
            # BUG-RISK FIX: the batch index was named `i`, shadowing the
            # outer configuration loop's `i`; renamed to `start`.
            for start in p_bar:
                idx = indices[start: start + 64]
                loss, acc = net.step(X_train[idx], y_train[idx])
                losses.append(loss)
                accuracies.append(acc)
                p_bar.set_postfix({'loss': f'{loss:.4f}', 'accuracy': f'{acc:.4f}'})
            val_acc = net.evaluate(X_val, y_val)
            print(f'epoch {epoch+1}: train_loss={np.mean(losses):.4f}, train_accuracy={np.mean(accuracies):.4f}, val_accuracy={val_acc:.4f}')
        test_acc = net.evaluate(X_test, y_test)
        print(f'\ntest_accuracy: {test_acc:.4f}\n')
