Using everything I’ve learned in this first course, I’m trying to wrap a network in a neat Python class, so I can extend it in future courses and have the cache handled internally. I think I understand the structure and the equations for the weights and gradients, but even after reimplementing from scratch the network only predicts 0 for every input, with output probabilities all in a similar narrow range, e.g. [[0.36001726 0.30796208 0.30430954 0.34583879, …]].
The structure of the network is as follows (the equations I believe each step implements are written out after this list):

1. Initialize the network with self.init_params(), which makes a random weight matrix for each layer and a bias vector of np.zeros that matches the size of the layer.
2. Call model.fit(x, y), which, on each cycle:
   - does a forward pass with self.forward(), computing the linear step plus a relu for all layers except the final layer, which has a sigmoid;
   - computes the cost of this pass with self.compute_score();
   - computes the gradients with self.backward() (I think most of the complexity, and the likely bug, is here);
   - updates the params by subtracting the learning rate times the gradients just computed.
3. Call model.predict(x_test), which does a forward pass with x_test and uses the sigmoid output of the final layer as the probabilities. These are all in the narrow range shown above and always < .5, so every prediction is zero.
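For reference, these are the equations I believe each step implements (standard course notation, with $a^{[0]} = x$, $L$ weight layers, and $m$ training examples):

$$z^{[l]} = W^{[l]} a^{[l-1]} + b^{[l]}, \qquad a^{[l]} = \mathrm{ReLU}(z^{[l]}) \ \text{for } l < L, \qquad a^{[L]} = \sigma(z^{[L]}) = \frac{1}{1 + e^{-z^{[L]}}}$$

$$J = -\frac{1}{m} \sum_{i=1}^{m} \left[ y^{(i)} \log \hat{y}^{(i)} + \left(1 - y^{(i)}\right) \log\left(1 - \hat{y}^{(i)}\right) \right]$$

$$da^{[L]} = -\left( \frac{y}{a^{[L]}} - \frac{1 - y}{1 - a^{[L]}} \right), \qquad dz^{[l]} = da^{[l]} \odot g'\!\left(z^{[l]}\right)$$

$$dW^{[l]} = \frac{1}{m} \, dz^{[l]} \, a^{[l-1]\top}, \qquad db^{[l]} = \frac{1}{m} \sum_{i=1}^{m} dz^{[l](i)}, \qquad da^{[l-1]} = W^{[l]\top} dz^{[l]}$$

$$W^{[l]} \leftarrow W^{[l]} - \alpha \, dW^{[l]}, \qquad b^{[l]} \leftarrow b^{[l]} - \alpha \, db^{[l]}$$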
There is no problem with the train/test data: a simple scikit-learn logistic regression makes reasonable predictions on the same split. Can you help me identify the error? I rewrote this from scratch a second time with the same outcome; did I transcribe one of the equations wrong? Thank you very much for any help.
import numpy as np

class nn:
    def __init__(self, layers, lr=.005):
        self.layers = layers
        self.n_layers = len(layers) - 1  # number of weight layers
        self.lr = lr
        self.grads = {}
        self.params = self.init_params()

    def init_params(self):
        # small random weights, zero biases
        params = {}
        L = self.layers
        for l in range(1, len(L)):
            params[f'w{l}'] = np.random.rand(L[l], L[l - 1]) * 0.01
            params[f'b{l}'] = np.zeros((L[l], 1))
        return params

    def relu(self, z):
        return np.maximum(z, 0)

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def forward(self, x):
        a = x
        n_layers = self.n_layers
        # hidden layers: linear + relu, caching z and a for backprop
        for l in range(1, n_layers):
            z = np.dot(self.params[f'w{l}'], a) + self.params[f'b{l}']
            a = self.relu(z)
            self.params[f'z{l}'] = z
            self.params[f'a{l}'] = a
        # output layer: linear + sigmoid
        z = np.dot(self.params[f'w{n_layers}'], self.params[f'a{n_layers - 1}']) + self.params[f'b{n_layers}']
        a = self.sigmoid(z)
        self.params[f'z{n_layers}'] = z
        self.params[f'a{n_layers}'] = a

    def compute_score(self):
        # cross-entropy cost
        m = self.m
        y = self.y
        y_hat = self.params[f'a{self.n_layers}']
        cost = -(1 / m) * (np.dot(y, np.log(y_hat).T) + np.dot(1 - y, np.log(1 - y_hat).T))
        return np.squeeze(cost)

    def back_sigmoid(self, da, z):
        s = 1 / (1 + np.exp(-z))
        dz = da * s * (1 - s)
        return dz

    def back_relu(self, da, z):
        dz = np.array(da, copy=True)
        dz[z <= 0] = 0
        return dz

    def back_linear(self, dz, a_prev, w):
        m = self.m
        dw = (1 / m) * np.dot(dz, a_prev.T)
        db = (1 / m) * np.sum(dz, axis=1, keepdims=True)
        da_prev = np.dot(w.T, dz)
        return dw, db, da_prev

    def backward(self):
        # compute daL from the cross-entropy derivative
        nl = self.n_layers
        last_a = self.params[f'a{nl}']
        dal = -(np.divide(self.y, last_a) - np.divide(1 - self.y, 1 - last_a))
        self.grads[f'da{nl}'] = dal
        # first backward step (sigmoid)
        dzl = self.back_sigmoid(dal, self.params[f'z{nl}'])
        self.grads[f'dz{nl}'] = dzl
        a_prev = self.params[f'a{nl - 1}']
        dw, db, da_prev = self.back_linear(dzl, a_prev, self.params[f'w{nl}'])
        self.grads[f'dw{nl}'] = dw
        self.grads[f'db{nl}'] = db
        self.grads[f'da{nl - 1}'] = da_prev
        # rest of backward steps (relu)
        for l in reversed(range(1, nl)):
            da = self.grads[f'da{l}']
            dzl = self.back_relu(da, self.params[f'z{l}'])
            self.grads[f'dz{l}'] = dzl
            a_prev = self.params[f'a{l - 1}']
            dw, db, da_prev = self.back_linear(dzl, a_prev, self.params[f'w{l}'])
            self.grads[f'dw{l}'] = dw
            self.grads[f'db{l}'] = db
            self.grads[f'da{l - 1}'] = da_prev

    def fit(self, x, y, n_cycles=100):
        self.params['a0'] = x  # the input acts as activation 0 for backprop
        self.y = y
        self.m = y.shape[1]
        for i in range(n_cycles):
            self.forward(x)
            self.compute_score()
            self.backward()
            self.update_params()

    def update_params(self):
        # gradient descent step
        for l in range(1, len(self.layers)):
            self.params[f'w{l}'] = self.params[f'w{l}'] - self.lr * self.grads[f'dw{l}']
            self.params[f'b{l}'] = self.params[f'b{l}'] - self.lr * self.grads[f'db{l}']

    def predict(self, x):
        self.forward(x)
        yhat = self.params[f'a{self.n_layers}']
        print(yhat)
        return 1. * (yhat >= .5)
layers = [12288, 5, 3, 1]
network = nn(layers)
network.fit(x_train, y_train, n_cycles=100)
network.predict(x_test) # gives all 0s
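In case it helps anyone reproduce this without my dataset, here is a small synthetic smoke test I can run instead; the toy shapes and labels are made up (fit expects x of shape (features, m) and y of shape (1, m)):

import numpy as np

# toy data just to exercise the class; not my real dataset
rng = np.random.default_rng(0)
x_toy = rng.standard_normal((12288, 50))   # (features, m)
y_toy = (x_toy[0:1, :] > 0).astype(float)  # (1, m), labels from one feature

toy_net = nn([12288, 5, 3, 1])
toy_net.fit(x_toy, y_toy, n_cycles=100)
preds = toy_net.predict(x_toy)
print('toy train accuracy:', (preds == y_toy).mean())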