Using everything I've learned in this first course, I'm trying to build a network as a neat Python class, so I can extend it in future courses and have the cache handled internally. I think I understand the structure and the equations for the weights and gradients, but even after re-implementing from scratch, the network predicts 0 for every input, with output probabilities all in a similar range, e.g. `[[0.36001726 0.30796208 0.30430954 0.34583879, …]]`.
The structure of the network is as follows:

- Initialize the network with `self.init_params()`, which makes a random weight matrix for each layer and a bias vector of `np.zeros` that matches the size of the layer.
- Call `model.fit(x, y)`, which then:
  - does a forward pass with `self.forward()`, computing the linear step followed by a ReLU for all layers except the final layer, which uses a sigmoid;
  - computes the cost of this pass with `self.compute_score()`;
  - computes the gradients with `self.backward()` (I think most of the complexity, and the likely bug, is here; the equations I'm working from are written out below);
  - updates the params with the learning rate times the gradients just computed.
- Call `model.predict(x_test)`, which does a forward pass on x_test and uses the sigmoid output of the final layer as the probabilities. These all land in the same narrow range and are always < .5, meaning a prediction of zero for everything.
There is no problem with the train/test data: a simple scikit-learn logistic regression on the same split makes reasonable predictions. Can you help me identify the error? I rewrote the whole thing from scratch a second time and got the same outcome, so did I transcribe one of the equations above wrong? Thank you very much for any help.
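In case it helps narrow things down, this is the numerical gradient check I'm planning to run against `backward()` (an untested sketch, not part of the class: `numeric_grad_check` is just an ad-hoc helper that pokes at the internals directly, and it assumes the class definition below):

```python
import numpy as np

def numeric_grad_check(model, x, y, eps=1e-7, n_checks=20):
    # Compare backward()'s analytic dw1 against a centered finite
    # difference of the cost, one randomly chosen weight at a time.
    model.params['a0'] = x
    model.y = y
    model.m = y.shape[1]

    def cost():
        model.forward(x)
        y_hat = model.params[f'a{model.n_layers}']
        m = model.m
        return ((-1/m) * (np.dot(y, np.log(y_hat).T)
                          + np.dot(1 - y, np.log(1 - y_hat).T))).item()

    model.forward(x)
    model.backward()
    w = model.params['w1']
    analytic = model.grads['dw1']
    rng = np.random.default_rng(0)
    for _ in range(n_checks):
        i = rng.integers(w.shape[0])
        j = rng.integers(w.shape[1])
        old = w[i, j]
        w[i, j] = old + eps
        c_plus = cost()
        w[i, j] = old - eps
        c_minus = cost()
        w[i, j] = old  # restore the weight before the next probe
        numeric = (c_plus - c_minus) / (2 * eps)
        print(f'w1[{i},{j}]: analytic={analytic[i, j]:.3e}  numeric={numeric:.3e}')
```

Here is the full class: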
```python
import numpy as np


class nn:
    def __init__(self, layers, lr=.005):
        self.layers = layers
        self.n_layers = len(layers) - 1
        self.lr = lr
        self.grads = {}
        self.params = self.init_params()

    def init_params(self):
        # one random weight matrix per layer, plus a zero bias vector
        # matching the layer size
        params = {}
        L = self.layers
        for l in range(1, len(L)):
            params[f'w{l}'] = np.random.rand(L[l], L[l-1]) * 0.01
            params[f'b{l}'] = np.zeros((L[l], 1))
        return params

    def relu(self, z):
        return np.maximum(z, 0)

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def forward(self, x):
        # linear step + relu for every layer except the last, which is
        # sigmoid; z and a are cached in self.params for the backward pass
        a = x
        n_layers = self.n_layers
        for l in range(1, n_layers):
            z = np.dot(self.params[f'w{l}'], a) + self.params[f'b{l}']
            a = self.relu(z)
            self.params[f'z{l}'] = z
            self.params[f'a{l}'] = a
        z = np.dot(self.params[f'w{n_layers}'], self.params[f'a{n_layers-1}']) + self.params[f'b{n_layers}']
        a = self.sigmoid(z)
        self.params[f'z{n_layers}'] = z
        self.params[f'a{n_layers}'] = a

    def compute_score(self):
        # cross-entropy cost
        m = self.m
        y = self.y
        y_hat = self.params[f'a{self.n_layers}']
        cost = (-1/m) * (np.dot(y, np.log(y_hat).T) + np.dot((1-y), np.log(1-y_hat).T))
        return cost

    def back_sigmoid(self, da, z):
        s = 1 / (1 + np.exp(-z))
        dz = da * s * (1 - s)
        return dz

    def back_relu(self, da, z):
        dz = np.array(da, copy=True)
        dz[z <= 0] = 0
        return dz

    def back_linear(self, dz, a_prev, w):
        m = self.m
        dw = (1/m) * np.dot(dz, a_prev.T)
        db = (1/m) * np.sum(dz, axis=1, keepdims=True)
        da_prev = np.dot(w.T, dz)
        return dw, db, da_prev

    def backward(self):
        # compute daL from the cross-entropy / sigmoid output
        nl = self.n_layers
        last_a = self.params[f'a{nl}']
        dal = -(np.divide(self.y, last_a) - np.divide(1-self.y, 1-last_a))
        self.grads[f'da{nl}'] = dal
        # first backward step (sigmoid)
        dzl = self.back_sigmoid(dal, self.params[f'z{nl}'])
        self.grads[f'dz{nl}'] = dzl
        a_prev = self.params[f'a{nl-1}']
        dw, db, da_prev = self.back_linear(dzl, a_prev, self.params[f'w{nl}'])
        self.grads[f'dw{nl}'] = dw
        self.grads[f'db{nl}'] = db
        self.grads[f'da{nl-1}'] = da_prev
        # rest of backward steps (relu)
        for l in reversed(range(1, nl)):
            da = self.grads[f'da{l}']
            dzl = self.back_relu(da, self.params[f'z{l}'])
            self.grads[f'dz{l}'] = dzl
            a_prev = self.params[f'a{l-1}']
            dw, db, da_prev = self.back_linear(dzl, a_prev, self.params[f'w{l}'])
            self.grads[f'dw{l}'] = dw
            self.grads[f'db{l}'] = db
            self.grads[f'da{l-1}'] = da_prev

    def fit(self, x, y, n_cycles=100):
        self.params['a0'] = x  # cache the input as a0 for back_linear at l=1
        self.y = y
        self.m = y.shape[1]
        for i in range(n_cycles):
            self.forward(x)
            self.compute_score()
            self.backward()
            self.update_params()

    def update_params(self):
        # gradient descent step: param = param - lr * grad
        for l in range(1, len(self.layers)):
            self.params[f'w{l}'] = self.params[f'w{l}'] - self.lr * self.grads[f'dw{l}']
            self.params[f'b{l}'] = self.params[f'b{l}'] - self.lr * self.grads[f'db{l}']

    def predict(self, x):
        self.forward(x)
        yhat = self.params[f'a{self.n_layers}']
        print(yhat)
        return 1. * (yhat >= .5)


layers = [12288, 5, 3, 1]
network = nn(layers)
network.fit(x_train, y_train, n_cycles=100)
network.predict(x_test)  # gives all 0s
```
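And for completeness, this is roughly the synthetic smoke test I'd run to rule the data out entirely (a sketch: `make_classification` just stands in for my real 12288-feature image data, with the `(features, m)` / `(1, m)` shapes the class expects):

```python
from sklearn.datasets import make_classification

# tiny synthetic binary problem, transposed into the column-per-example
# layout used above
x_raw, y_raw = make_classification(n_samples=200, n_features=20, random_state=0)
x_toy = x_raw.T               # shape (20, 200)
y_toy = y_raw.reshape(1, -1)  # shape (1, 200)

toy_net = nn([20, 5, 3, 1])
toy_net.fit(x_toy, y_toy, n_cycles=1000)
preds = toy_net.predict(x_toy)
print('train accuracy:', (preds == y_toy).mean())
```

If the same all-zeros behaviour shows up here too, the bug is definitely in the class and not in my data.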