I'm getting an output mismatch for the lstm_backward function and can't figure out what's causing it. For what it's worth, the previous function, lstm_cell_backward, works fine (i.e. its output matches the expected values).
Here is my function:
def lstm_backward(da, caches):
    """
    Implement the backward pass for the RNN with LSTM-cell (over a whole sequence).

    Arguments:
    da -- Gradients w.r.t the hidden states, numpy array of shape (n_a, m, T_x)
    caches -- cache storing information from the forward pass (lstm_forward)

    Returns:
    gradients -- python dictionary containing:
                 dx -- Gradient of inputs, of shape (n_x, m, T_x)
                 da0 -- Gradient w.r.t. the previous hidden state, numpy array of shape (n_a, m)
                 dWf -- Gradient w.r.t. the weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x)
                 dWi -- Gradient w.r.t. the weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
                 dWc -- Gradient w.r.t. the weight matrix of the memory gate, numpy array of shape (n_a, n_a + n_x)
                 dWo -- Gradient w.r.t. the weight matrix of the output gate, numpy array of shape (n_a, n_a + n_x)
                 dbf -- Gradient w.r.t. biases of the forget gate, of shape (n_a, 1)
                 dbi -- Gradient w.r.t. biases of the update gate, of shape (n_a, 1)
                 dbc -- Gradient w.r.t. biases of the memory gate, of shape (n_a, 1)
                 dbo -- Gradient w.r.t. biases of the output gate, of shape (n_a, 1)
    """
    # Retrieve values from the first cache (t=1) of caches.
    (caches, x) = caches
    (a1, c1, a0, c0, f1, i1, cc1, o1, x1, parameters) = caches[0]

    ### START CODE HERE ###
    # Retrieve dimensions from da's and x1's shapes (≈2 lines)
    n_a, m, T_x = da.shape
    n_x, m = x1.shape

    # Initialize the gradients with the right sizes (≈12 lines)
    dx = np.zeros((n_x, m, T_x))
    da0 = np.zeros((n_a, m))
    da_prevt = np.zeros((n_a, m))  # same shape as da0
    dc_prevt = np.zeros((n_a, m))
    dWf = np.zeros((n_a, n_a + n_x))
    dWi = np.zeros((n_a, n_a + n_x))
    dWc = np.zeros((n_a, n_a + n_x))
    dWo = np.zeros((n_a, n_a + n_x))
    dbf = np.zeros((n_a, 1))
    dbi = np.zeros((n_a, 1))
    dbc = np.zeros((n_a, 1))
    dbo = np.zeros((n_a, 1))

    # Loop back over the whole sequence
    for t in reversed(range(T_x)):
        # Compute all gradients using lstm_cell_backward
        gradients = lstm_cell_backward(da[:, :, t] + da_prevt, dc_prevt, caches[t])
        # Store or add the gradient to the parameters' previous step's gradient
        da_prevt = gradients["da_prev"]
        dc_prevt = gradients["dc_prev"]
        dx[:, :, t] = gradients["dxt"]
        dWf += gradients["dWf"]
        dWi += gradients["dWi"]
        dWc += gradients["dWc"]
        dWo += gradients["dWo"]
        dbf += gradients["dbf"]
        dbi += gradients["dbi"]
        dbc += gradients["dbc"]
        dbo += gradients["dbo"]

    # Set the first activation's gradient to the backpropagated gradient da_prev.
    da0 = da_prevt
    ### END CODE HERE ###

    # Store the gradients in a python dictionary
    gradients = {"dx": dx, "da0": da0, "dWf": dWf, "dbf": dbf, "dWi": dWi, "dbi": dbi,
                 "dWc": dWc, "dbc": dbc, "dWo": dWo, "dbo": dbo}

    return gradients
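For context, this is roughly how the test cell drives the function. I'm reconstructing the setup from the shapes in my output (n_x = 3, m = 10, T_x = 4, n_a = 5), so the seed, the n_y = 2, and the exact parameter ordering are my assumptions, not necessarily the notebook's actual cell:

```python
import numpy as np

np.random.seed(1)                       # assumed seed, may differ from the notebook
x = np.random.randn(3, 10, 4)           # (n_x, m, T_x), inferred from dx.shape
a0 = np.random.randn(5, 10)             # (n_a, m), inferred from da0.shape
parameters = {
    "Wf": np.random.randn(5, 5 + 3), "bf": np.random.randn(5, 1),
    "Wi": np.random.randn(5, 5 + 3), "bi": np.random.randn(5, 1),
    "Wc": np.random.randn(5, 5 + 3), "bc": np.random.randn(5, 1),
    "Wo": np.random.randn(5, 5 + 3), "bo": np.random.randn(5, 1),
    "Wy": np.random.randn(2, 5),     "by": np.random.randn(2, 1),  # n_y = 2 assumed
}

a, y, c, caches = lstm_forward(x, a0, parameters)  # forward pass from the notebook
da = np.random.randn(5, 10, 4)                     # upstream gradients w.r.t. the hidden states
gradients = lstm_backward(da, caches)
```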
Here is the output I get:

gradients["dx"][1][2] = [ 1.31969401e-04 1.27452521e-01 -5.41268133e-01 -4.32811154e-01]
gradients["dx"].shape = (3, 10, 4)
gradients["da0"][2][3] = 0.05309040679404092
gradients["da0"].shape = (5, 10)
gradients["dWf"][3][1] = -0.08880741124009006
gradients["dWf"].shape = (5, 8)
gradients["dWi"][1][2] = 0.21747041417406698
gradients["dWi"].shape = (5, 8)
gradients["dWc"][3][1] = -0.100572859366794
gradients["dWc"].shape = (5, 8)
gradients["dWo"][1][2] = 0.027904145579261054
gradients["dWo"].shape = (5, 8)
gradients["dbf"][4] = [-0.06003659]
gradients["dbf"].shape = (5, 1)
gradients["dbi"][4] = [-0.26347689]
gradients["dbi"].shape = (5, 1)
gradients["dbc"][4] = [-0.38110822]
gradients["dbc"].shape = (5, 1)
gradients["dbo"][4] = [-0.22944017]
gradients["dbo"].shape = (5, 1)
Expected output:

| Variable | Expected value |
| --- | --- |
| gradients["dx"][1][2] = | [0.00218254 0.28205375 -0.48292508 -0.43281115] |
| gradients["dx"].shape = | (3, 10, 4) |
| gradients["da0"][2][3] = | 0.312770310257 |
| gradients["da0"].shape = | (5, 10) |
| gradients["dWf"][3][1] = | -0.0809802310938 |
| gradients["dWf"].shape = | (5, 8) |
| gradients["dWi"][1][2] = | 0.40512433093 |
| gradients["dWi"].shape = | (5, 8) |
| gradients["dWc"][3][1] = | -0.0793746735512 |
| gradients["dWc"].shape = | (5, 8) |
| gradients["dWo"][1][2] = | 0.038948775763 |
| gradients["dWo"].shape = | (5, 8) |
| gradients["dbf"][4] = | [-0.15745657] |
| gradients["dbf"].shape = | (5, 1) |
| gradients["dbi"][4] = | [-0.50848333] |
| gradients["dbi"].shape = | (5, 1) |
| gradients["dbc"][4] = | [-0.42510818] |
| gradients["dbc"].shape = | (5, 1) |
| gradients["dbo"][4] = | [-0.17958196] |
| gradients["dbo"].shape = | (5, 1) |
Here is the lstm_cell_backward function in case you want to look at it:
def lstm_cell_backward(da_next, dc_next, cache):
    """
    Implement the backward pass for the LSTM-cell (single time-step).
    """
    # Retrieve information from "cache"
    (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters) = cache

    ### START CODE HERE ###
    # Retrieve dimensions from xt's and a_next's shape (≈2 lines)
    n_x, m = xt.shape
    n_a, m = a_next.shape

    # Compute gates related derivatives. Their values can be found by looking carefully at equations (7) to (10) (≈4 lines)
    dot = da_next * np.tanh(c_next) * ot * (1 - ot)
    dcct = (dc_next * it + ot * (1 - np.square(np.tanh(c_next))) * it * da_next) * (1 - cct ** 2)
    dit = (dc_next * cct + ot * (1 - np.square(np.tanh(c_next))) * cct * da_next) * it * (1 - it)
    dft = (dc_next * c_prev + ot * (1 - np.square(np.tanh(c_next))) * c_prev * da_next) * ft * (1 - ft)

    # Compute parameters related derivatives. Use equations (11)-(18) (≈8 lines)
    dWf = np.dot(dft, np.concatenate((a_prev, xt), axis=0).T)
    dWi = np.dot(dit, np.concatenate((a_prev, xt), axis=0).T)
    dWc = np.dot(dcct, np.concatenate((a_prev, xt), axis=0).T)
    dWo = np.dot(dot, np.concatenate((a_prev, xt), axis=0).T)
    dbf = np.sum(dft, axis=1, keepdims=True)
    dbi = np.sum(dit, axis=1, keepdims=True)
    dbc = np.sum(dcct, axis=1, keepdims=True)
    dbo = np.sum(dot, axis=1, keepdims=True)

    # Compute derivatives w.r.t previous hidden state, previous memory state and input. Use equations (19)-(21). (≈3 lines)
    da_prev = np.dot(parameters["Wf"][:, :n_a].T, dft) \
              + np.dot(parameters["Wi"][:, :n_a].T, dit) \
              + np.dot(parameters["Wc"][:, :n_a].T, dcct) \
              + np.dot(parameters["Wo"][:, :n_a].T, dot)
    dc_prev = dc_next * ft + ot * (1 - np.square(np.tanh(c_next))) * dft * da_next
    dxt = np.dot(parameters["Wf"][:, n_a:].T, dft) \
          + np.dot(parameters["Wi"][:, n_a:].T, dit) \
          + np.dot(parameters["Wc"][:, n_a:].T, dcct) \
          + np.dot(parameters["Wo"][:, n_a:].T, dot)
    ### END CODE HERE ###

    # Save gradients in dictionary
    gradients = {"dxt": dxt, "da_prev": da_prev, "dc_prev": dc_prev, "dWf": dWf, "dbf": dbf, "dWi": dWi, "dbi": dbi,
                 "dWc": dWc, "dbc": dbc, "dWo": dWo, "dbo": dbo}

    return gradients
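In case it helps narrow things down, here is a quick finite-difference check I can run on dx. It's a minimal sketch that assumes lstm_forward returns (a, y, c, caches) as in the notebook and that x, a0, parameters, and da from the test cell are in scope; with J = sum(da * a), the upstream gradient w.r.t. the hidden states is exactly da, so the dx from lstm_backward should match the numerical gradient of J w.r.t. x:

```python
import numpy as np

def seq_loss(x_in):
    # Scalar objective whose gradient w.r.t. the hidden states a is exactly da.
    a, y, c, caches = lstm_forward(x_in, a0, parameters)
    return np.sum(da * a)

def numeric_dx(x_in, eps=1e-5):
    # Central finite differences, one input entry at a time.
    grad = np.zeros_like(x_in)
    it = np.nditer(x_in, flags=["multi_index"])
    while not it.finished:
        idx = it.multi_index
        orig = x_in[idx]
        x_in[idx] = orig + eps
        j_plus = seq_loss(x_in)
        x_in[idx] = orig - eps
        j_minus = seq_loss(x_in)
        x_in[idx] = orig
        grad[idx] = (j_plus - j_minus) / (2.0 * eps)
        it.iternext()
    return grad

a, y, c, caches = lstm_forward(x, a0, parameters)
analytic_dx = lstm_backward(da, caches)["dx"]
print("max |analytic - numeric| for dx:", np.max(np.abs(analytic_dx - numeric_dx(x))))
```

A large discrepancy here would point at the backward pass itself rather than at how the expected values were generated.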