2-2 Regularization

np.square(Wl) computes the element-wise square.

>>> np.square([-1, 1])
array([1, 1], dtype=int32)
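
On a 2-D array (a made-up weight matrix here, just for illustration), np.square is still applied element-wise; summing the result gives the per-matrix quantity the L2 cost below needs:

>>> W = np.array([[1., -2.], [3., 0.]])
>>> np.square(W)
array([[1., 4.],
       [9., 0.]])
>>> np.sum(np.square(W))
14.0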

np.nansum() treats non-numbers (NaN) as zero when summing the elements.

>>> np.nansum([1, np.nan])
1.0
>>> a = np.array([[1, 1], [1, np.nan]])
>>> np.nansum(a)
3.0
>>> np.nansum(a, axis=0)
array([ 2., 1.])
>>> np.nansum([1, np.nan, np.inf])
inf
>>> np.nansum([1, np.nan, np.NINF])
-inf
>>> np.nansum([1, np.nan, np.inf, -np.inf]) # both +/- infinity present
nan

np.c_[xx.ravel(), yy.ravel()]
np.r_ concatenates two arrays along rows, i.e., stacks them top to bottom, so they must have the same number of columns.
np.c_ concatenates two arrays along columns, i.e., places them side by side, so they must have the same number of rows.
numpy.ravel(a, order='C') flattens a multi-dimensional array to 1-D, equivalent to reshape(-1, order=order). An r_/c_ example follows the ravel snippet below.

>>> x = np.array([[1, 2, 3], [4, 5, 6]])
>>> print(np.ravel(x))
[1 2 3 4 5 6]
>>> print(x.reshape(-1))
[1 2 3 4 5 6]
>>>
>>> print(np.ravel(x, order='F'))
[1 4 2 5 3 6]
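
To complement the ravel example, here is a quick illustration of np.r_ and np.c_ (the arrays are made up for demonstration):

>>> a = np.array([[1, 2], [3, 4]])
>>> b = np.array([[5, 6], [7, 8]])
>>> np.r_[a, b]            # stack vertically; same number of columns required
array([[1, 2],
       [3, 4],
       [5, 6],
       [7, 8]])
>>> np.c_[a, b]            # stack horizontally; same number of rows required
array([[1, 2, 5, 6],
       [3, 4, 7, 8]])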

This post covers two ways to reduce overfitting: L2 regularization and Dropout regularization.

Algorithm Steps

We start from a three-layer neural network model and apply regularization to it.

Model

Among the model's arguments, lambd is the L2 regularization parameter and keep_prob is the Dropout parameter. When lambd > 0, L2 regularization is used: forward propagation is unchanged, but the cost must be computed with compute_cost_with_regularization and the gradients with backward_propagation_with_regularization. When keep_prob != 1, Dropout is used: the cost function is unchanged, but forward propagation uses forward_propagation_with_dropout and backward propagation uses backward_propagation_with_dropout. These functions are described below.

def model(X, Y, learning_rate = 0.3, num_iterations = 30000, print_cost = True, lambd = 0, keep_prob = 1):
    """
    Implements a three-layer neural network: LINEAR->RELU->LINEAR->RELU->LINEAR->SIGMOID.
    Arguments:
    lambd -- regularization hyperparameter, scalar
    keep_prob -- probability of keeping a neuron active during drop-out, scalar.
    Returns:
    parameters -- parameters learned by the model. They can then be used to predict.
    """

    grads = {}
    costs = []                            # to keep track of the cost
    m = X.shape[1]                        # number of examples
    layers_dims = [X.shape[0], 20, 3, 1]

    # Initialize parameters dictionary.
    parameters = initialize_parameters(layers_dims)

    # Loop (gradient descent)
    for i in range(0, num_iterations):

        # Forward propagation: LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID.
        if keep_prob == 1:
            a3, cache = forward_propagation(X, parameters)
        elif keep_prob < 1:
            a3, cache = forward_propagation_with_dropout(X, parameters, keep_prob)

        # Cost function
        if lambd == 0:
            cost = compute_cost(a3, Y)
        else:
            cost = compute_cost_with_regularization(a3, Y, parameters, lambd)

        # Backward propagation.
        assert (lambd == 0 or keep_prob == 1)   # it is possible to use both L2 regularization and dropout,
                                                # but this assignment will only explore one at a time
        if lambd == 0 and keep_prob == 1:
            grads = backward_propagation(X, Y, cache)
        elif lambd != 0:
            grads = backward_propagation_with_regularization(X, Y, cache, lambd)
        elif keep_prob < 1:
            grads = backward_propagation_with_dropout(X, Y, cache, keep_prob)

        # Update parameters.
        parameters = update_parameters(parameters, grads, learning_rate)

        # Print the loss every 10000 iterations
        if print_cost and i % 10000 == 0:
            print("Cost after iteration {}: {}".format(i, cost))
        if print_cost and i % 1000 == 0:
            costs.append(cost)

    # plot the cost
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('iterations (x1,000)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    return parameters
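
A minimal usage sketch, assuming train_X has shape (n_x, m) and train_Y has shape (1, m); the variable names and the concrete values of lambd and keep_prob below are only placeholders for illustration:

# Baseline, no regularization (lambd = 0, keep_prob = 1)
parameters = model(train_X, train_Y)

# L2 regularization only (example value for lambd)
parameters = model(train_X, train_Y, lambd = 0.7)

# Dropout only (example keep probability)
parameters = model(train_X, train_Y, keep_prob = 0.86)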

L2 Regularization

L2 regularization relies on the assumption that a model with small weights is simpler than a model with large weights.
For L2 regularization, a penalty term on the weight matrices W is added to the original cost function.
$$J_{regularized} = \small \underbrace{-\frac{1}{m} \sum\limits_{i = 1}^{m} \large{(}\small y^{[i]}\log\left(a^{[L][i]}\right) + (1-y^{[i]})\log\left(1- a^{[L][i]}\right) \large{)} }_\text{cross-entropy cost} + \underbrace{\frac{1}{m} \frac{\lambda}{2} \sum\limits_l\sum\limits_k\sum\limits_j W_{k,j}^{[l]2} }_\text{L2 regularization cost} $$
Below we implement the function compute_cost_with_regularization().

def compute_cost_with_regularization(A3, Y, parameters, lambd):
    m = Y.shape[1]
    W1 = parameters["W1"]
    W2 = parameters["W2"]
    W3 = parameters["W3"]

    cross_entropy_cost = compute_cost(A3, Y)  # This gives you the cross-entropy part of the cost

    L2_regularization_cost = (np.sum(np.square(W1)) + np.sum(np.square(W2)) + np.sum(np.square(W3))) / m * lambd / 2

    cost = cross_entropy_cost + L2_regularization_cost

    return cost
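
As a quick sanity check of the L2 term in isolation, with tiny made-up matrices (not the model's real parameters):

import numpy as np

W1 = np.array([[1., -2.], [3., 0.]])   # sum of squares = 14
W2 = np.array([[1.]])                  # sum of squares = 1
W3 = np.array([[2.]])                  # sum of squares = 4
lambd, m = 2.0, 2
L2_term = (np.sum(np.square(W1)) + np.sum(np.square(W2)) + np.sum(np.square(W3))) / m * lambd / 2
print(L2_term)   # (14 + 1 + 4) / 2 * 2 / 2 = 9.5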

Once the cost function changes, back-propagation changes accordingly: the formulas for dW1, dW2 and dW3 each gain the derivative of the regularization term, $\frac{d}{dW} \left( \frac{1}{2}\frac{\lambda}{m} W^2 \right) = \frac{\lambda}{m} W$.

def backward_propagation_with_regularization(X, Y, cache, lambd):
    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

    dZ3 = A3 - Y

    dW3 = 1./m * np.dot(dZ3, A2.T) + (lambd / m * W3)
    db3 = 1./m * np.sum(dZ3, axis=1, keepdims=True)

    dA2 = np.dot(W3.T, dZ3)
    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    dW2 = 1./m * np.dot(dZ2, A1.T) + (lambd / m * W2)
    db2 = 1./m * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.dot(W2.T, dZ2)
    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = 1./m * np.dot(dZ1, X.T) + (lambd / m * W1)
    db1 = 1./m * np.sum(dZ1, axis=1, keepdims=True)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
                 "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
                 "dZ1": dZ1, "dW1": dW1, "db1": db1}

    return gradients
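
As a quick check that the added term really is the derivative of the penalty, here is a toy finite-difference comparison on a single scalar weight (all values made up for illustration):

import numpy as np

lambd, m, w, eps = 0.7, 5, 3.0, 1e-6
reg = lambda w: lambd / (2 * m) * w**2             # penalty contributed by one weight
numeric = (reg(w + eps) - reg(w - eps)) / (2 * eps)  # finite-difference derivative
print(numeric, lambd / m * w)                      # both are approximately 0.42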

L2 regularization makes the decision boundary smoother, but if the hyperparameter λ is chosen too large the model may "oversmooth", resulting in high bias.

Dropout Regularization

First we implement the forward propagation function forward_propagation_with_dropout. Dropout is applied to the first and second hidden layers only, not to the input or output layer.

def forward_propagation_with_dropout(X, parameters, keep_prob = 0.5):
    """
    Implements the forward propagation: LINEAR -> RELU + DROPOUT -> LINEAR -> RELU + DROPOUT -> LINEAR -> SIGMOID.
    """

    np.random.seed(1)

    # retrieve parameters
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
    Z1 = np.dot(W1, X) + b1
    A1 = relu(Z1)

    D1 = np.random.rand(A1.shape[0], A1.shape[1])  # Step 1: initialize matrix D1 = np.random.rand(..., ...)
    D1 = (D1 < keep_prob)                          # Step 2: convert entries of D1 to 0 or 1 (using keep_prob as the threshold)
    A1 = np.multiply(A1, D1)                       # Step 3: shut down some neurons of A1; think of D[1] as a mask
    A1 /= keep_prob                                # Step 4: scale the value of neurons that haven't been shut down

    Z2 = np.dot(W2, A1) + b2
    A2 = relu(Z2)

    D2 = np.random.rand(A2.shape[0], A2.shape[1])  # Step 1: initialize matrix D2 = np.random.rand(..., ...)
    D2 = (D2 < keep_prob)                          # Step 2: convert entries of D2 to 0 or 1 (using keep_prob as the threshold)
    A2 = np.multiply(A2, D2)                       # Step 3: shut down some neurons of A2
    A2 /= keep_prob                                # Step 4: scale the value of neurons that haven't been shut down

    Z3 = np.dot(W3, A2) + b3
    A3 = sigmoid(Z3)

    cache = (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3)

    return A3, cache
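
The division by keep_prob in Step 4 is the "inverted dropout" trick: it keeps the expected value of the activations roughly the same as without Dropout, so no extra rescaling is needed at test time. A tiny self-contained check with made-up activations (not the network's own values):

import numpy as np

np.random.seed(0)
a = np.ones((5, 1000))                        # made-up activations, all equal to 1
keep_prob = 0.8
d = np.random.rand(*a.shape) < keep_prob      # dropout mask
a_drop = a * d / keep_prob                    # inverted dropout
print(a.mean(), a_drop.mean())                # both means are close to 1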

Then we implement the corresponding backward propagation. Taking dA1 as an example, dA1 must be multiplied by the same mask D[1] and divided by keep_prob.

def backward_propagation_with_dropout(X, Y, cache, keep_prob):

    m = X.shape[1]
    (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3) = cache

    dZ3 = A3 - Y
    dW3 = 1./m * np.dot(dZ3, A2.T)
    db3 = 1./m * np.sum(dZ3, axis=1, keepdims=True)
    dA2 = np.dot(W3.T, dZ3)

    dA2 = np.multiply(D2, dA2)  # Step 1: Apply mask D2 to shut down the same neurons as during the forward propagation
    dA2 /= keep_prob            # Step 2: Scale the value of neurons that haven't been shut down

    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    dW2 = 1./m * np.dot(dZ2, A1.T)
    db2 = 1./m * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.dot(W2.T, dZ2)

    dA1 = np.multiply(D1, dA1)  # Step 1: Apply mask D1 to shut down the same neurons as during the forward propagation
    dA1 /= keep_prob            # Step 2: Scale the value of neurons that haven't been shut down

    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = 1./m * np.dot(dZ1, X.T)
    db1 = 1./m * np.sum(dZ1, axis=1, keepdims=True)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
                 "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
                 "dZ1": dZ1, "dW1": dW1, "db1": db1}

    return gradients

A common mistake is to use Dropout both at training time and at test time; in fact, Dropout should only be used during training.
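
A minimal sketch of what prediction could look like: because of the inverted-dropout scaling (dividing by keep_prob) during training, test time simply uses the plain forward propagation with no mask and no rescaling. The function name predict_sketch and the 0.5 threshold below are illustrative, not from the original code:

def predict_sketch(X, parameters):
    # Test time: plain forward propagation, no dropout mask and no keep_prob scaling
    a3, _ = forward_propagation(X, parameters)
    return (a3 > 0.5).astype(int)   # threshold the sigmoid output for binary labels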