## python实现卷积层的前向后向传播过程 heibanke写于 2016年10月11日 --- python.

出自：网易云课堂视频课程《用Python做深度学习1——数学基础》



# Convolution Layer Forward

$$out_{n,f,ho,wo} = conv(XP, W, b, \mathrm{params})_{n,f,ho,wo} = \sum\limits_{c = 0}^{C - 1} \sum\limits_{p = 0}^{HH - 1} \sum\limits_{q = 0}^{WW - 1} XP_{n,c,\,ho \cdot S + p,\,wo \cdot S + q} \cdot W_{f,c,p,q} + b_{f}$$

n是输入的个数，比如输入100张图片，n＝100.

C是input channel，比如输入的图片是RGB三通道的，C＝3.

S是stride，stride为1，逐行扫描。stride为2，隔一行扫描一次。不理解stride的还要先查查其它文章。

XP是填0后的输入。若不填0，则XP＝X. 不理解填0操作的还要先查查其它文章。

F是filter number，系数的高和宽分别是HH，WW。Ho和Wo是输出的高，宽。


import numpy as np

def conv_forward_naive(x, w, b, conv_param):
    """
    A naive implementation of the forward pass for a convolutional layer.

    The input consists of N data points, each with C channels, height H and
    width W. We convolve each input with F different filters, where each
    filter spans all C channels and has height HH and width WW.

    Input:
    - x: Input data of shape (N, C, H, W)
    - w: Filter weights of shape (F, C, HH, WW)
    - b: Biases, of shape (F,)
    - conv_param: A dictionary with the following keys:
      - 'stride': The number of pixels between adjacent receptive fields in
        the horizontal and vertical directions.
      - 'pad': The number of pixels that will be used to zero-pad the input.

    Returns a tuple of:
    - out: Output data, of shape (N, F, Ho, Wo) where
        Ho = 1 + (H + 2 * pad - HH) // stride
        Wo = 1 + (W + 2 * pad - WW) // stride
    - cache: (x, w, b, conv_param)
    """
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape
    S = conv_param['stride']
    P = conv_param['pad']  # BUG FIX: P was used below but never defined

    # BUG FIX: x_pad was used below but never constructed.
    # Zero-pad only the two spatial dimensions, P pixels on each side.
    x_pad = np.pad(x, ((0, 0), (0, 0), (P, P), (P, P)), mode='constant')

    # Integer division so the output dims are usable as array sizes
    # (plain / produces floats on Python 3).
    Ho = 1 + (H + 2 * P - HH) // S
    Wo = 1 + (W + 2 * P - WW) // S
    out = np.zeros((N, F, Ho, Wo))

    for f in range(F):
        for i in range(Ho):
            for j in range(Wo):
                # (N,C,HH,WW) window * (C,HH,WW) filter broadcasts over N;
                # summing axes (1,2,3) leaves one scalar per data point.
                out[:, f, i, j] = np.sum(
                    x_pad[:, :, i * S:i * S + HH, j * S:j * S + WW] * w[f, :, :, :],
                    axis=(1, 2, 3))
        # One bias per filter, shared across all spatial positions.
        out[:, f, :, :] += b[f]

    cache = (x, w, b, conv_param)
    return out, cache

 


# Smoke test: all-ones input and filters, unit stride, no padding.
x_shape = (2, 3, 4, 4) #n,c,h,w
w_shape = (2, 3, 3, 3) #f,c,hw,ww
x = np.ones(x_shape)
w = np.ones(w_shape)
b = np.array([1,2])

conv_param = {'stride': 1, 'pad': 0}
out, _ = conv_forward_naive(x, w, b, conv_param)

# Every output element is the sum over a 3x3x3 ones window (= 27) plus
# the filter's bias, so filter 0 yields 28 and filter 1 yields 29.
print out
print out.shape  #n,f,ho,wo

 

[[[[ 28. 28.] [ 28. 28.]]

[[ 29. 29.] [ 29. 29.]]]

[[[ 28. 28.] [ 28. 28.]]

[[ 29. 29.] [ 29. 29.]]]]

(2, 2, 2, 2)

# Convolution Layer Backward

$$\frac{{\partial L}}{{\partial w}} = \frac{{\partial L}}{{\partial out}}*\frac{{\partial out}}{{\partial w}}$$

$$\frac{{\partial L}}{{\partial x}} = \frac{{\partial L}}{{\partial out}}*\frac{{\partial out}}{{\partial x}}$$

$$\frac{{\partial L}}{{\partial b}} = \frac{{\partial L}}{{\partial out}}*\frac{{\partial out}}{{\partial b}}$$

$$\frac{{\partial L}}{{\partial out}} = dout$$

dout在卷积层的后向过程是已知的，所以公式看上去很简单，就是下标处理复杂了点。我们慢慢来继续推导它。

$$\frac{{\partial L}}{{\partial {W_{f,c,:,:}}}} = \sum\limits_{n = 0,ho = 0,wo = 0}^{N - 1,Ho - 1,Wo - 1} {dou{t_{n,f,ho,wo}}} * \frac{{\partial ({XP_{n,c,h_{win},w_{win}}} * {W_{f,c,:,:}})}}{{\partial {W_{f,c,:,:}}}}$$ $$\ = \sum\limits_{n = 0,ho = 0,wo = 0}^{N - 1,Ho - 1,Wo - 1} {dou{t_{n,f,ho,wo}} * {XP_{n,c,h_{win},w_{win}}}}$$

$$\frac{{\partial L}}{{\partial {XP_{n,c,h_{win},w_{win}}}}} = \sum\limits_{f = 0,ho = 0,wo = 0}^{F - 1,Ho - 1,Wo - 1} {dou{t_{n,f,ho,wo}}} * \frac{{\partial ({XP_{n,c,h_{win},w_{win}}} * {w_{f,c,:,:}})}}{{\partial {X_{n,c,h_{win},w_{win}}}}}$$ $$\ = \sum\limits_{f = 0,ho = 0,wo = 0}^{F - 1,Ho - 1,Wo - 1} {dou{t_{n,f,ho,wo}} * {W_{f,c,:,:}}}$$

$$\frac{{\partial L}}{{\partial {b_f}}} = \sum\limits_{n = 0,ho = 0,wo = 0}^{N - 1,Ho - 1,Wo - 1} {dou{t_{n,f,ho,wo}}} * \frac{{\partial ({XP_{n,c,h_{win},w_{win}}} * {W_{f,c,:,:}}+b_f)}}{{\partial {b_f}}}$$ $$\ = \sum\limits_{n = 0,ho = 0,wo = 0}^{N - 1,Ho - 1,Wo - 1} {dou{t_{n,f,ho,wo}}}$$


def conv_backward_naive(dout, cache):
    """
    A naive implementation of the backward pass for a convolutional layer.

    Inputs:
    - dout: Upstream derivatives, of shape (N, F, Ho, Wo)
    - cache: A tuple of (x, w, b, conv_param) as in conv_forward_naive

    Returns a tuple of:
    - dx: Gradient with respect to x, of shape (N, C, H, W)
    - dw: Gradient with respect to w, of shape (F, C, HH, WW)
    - db: Gradient with respect to b, of shape (F,)
    """
    x, w, b, conv_param = cache
    N, F, Ho, Wo = dout.shape
    N, C, H, W = x.shape
    HH = w.shape[2]
    WW = w.shape[3]
    S = conv_param['stride']
    P = conv_param['pad']  # BUG FIX: P was used below but never defined

    # BUG FIX: x_pad and dx_pad were used below but never constructed.
    # Re-create the padded input the forward pass convolved over, and
    # accumulate the input gradient in padded coordinates.
    x_pad = np.pad(x, ((0, 0), (0, 0), (P, P), (P, P)), mode='constant')
    dx_pad = np.zeros_like(x_pad)
    dw = np.zeros_like(w)

    # Each bias feeds every spatial position of its filter's output map,
    # so its gradient is dout summed over everything except the filter axis.
    db = np.sum(dout, axis=(0, 2, 3))

    for n in range(N):
        for i in range(Ho):
            for j in range(Wo):
                # Input window the filters were applied over: (C, HH, WW).
                x_window = x_pad[n, :, i * S:i * S + HH, j * S:j * S + WW]
                for f in range(F):
                    # d(out)/d(w) for this position is the window itself.
                    dw[f] += x_window * dout[n, f, i, j]
                    # d(out)/d(x) for this position is the filter itself.
                    dx_pad[n, :, i * S:i * S + HH, j * S:j * S + WW] += \
                        w[f] * dout[n, f, i, j]

    # Strip the zero padding to recover the gradient w.r.t. the real input.
    dx = dx_pad[:, :, P:P + H, P:P + W]

    return dx, dw, db

 


x_shape = (2, 3, 4, 4)
w_shape = (2, 3, 3, 3)
x = np.ones(x_shape)
w = np.ones(w_shape)
b = np.array([1,2])

conv_param = {'stride': 1, 'pad': 0}

Wo = Ho

dout = np.ones((x_shape[0], w_shape[0], Ho, Wo))

out, cache = conv_forward_naive(x, w, b, conv_param)
dx, dw, db = conv_backward_naive(dout, cache)

print "out shape",out.shape
print "dw=========================="
print dw
print "dx=========================="
print dx
print "db=========================="
print db

 

out shape (2, 2, 2, 2)

dw==========================

[[[[ 8. 8. 8.] [ 8. 8. 8.] [ 8. 8. 8.]]

[[ 8. 8. 8.] [ 8. 8. 8.] [ 8. 8. 8.]]

[[ 8. 8. 8.] [ 8. 8. 8.] [ 8. 8. 8.]]]

[[[ 8. 8. 8.] [ 8. 8. 8.] [ 8. 8. 8.]]

[[ 8. 8. 8.] [ 8. 8. 8.] [ 8. 8. 8.]]

[[ 8. 8. 8.] [ 8. 8. 8.] [ 8. 8. 8.]]]]

dx==========================

[[[[ 2. 4. 4. 2.] [ 4. 8. 8. 4.] [ 4. 8. 8. 4.] [ 2. 4. 4. 2.]]

[[ 2. 4. 4. 2.] [ 4. 8. 8. 4.] [ 4. 8. 8. 4.] [ 2. 4. 4. 2.]]

[[ 2. 4. 4. 2.] [ 4. 8. 8. 4.] [ 4. 8. 8. 4.] [ 2. 4. 4. 2.]]]

[[[ 2. 4. 4. 2.] [ 4. 8. 8. 4.] [ 4. 8. 8. 4.] [ 2. 4. 4. 2.]]

[[ 2. 4. 4. 2.] [ 4. 8. 8. 4.] [ 4. 8. 8. 4.] [ 2. 4. 4. 2.]]

[[ 2. 4. 4. 2.] [ 4. 8. 8. 4.] [ 4. 8. 8. 4.] [ 2. 4. 4. 2.]]]]

db==========================

[ 8. 8.]