Object Localization MNIST lab, Tensorflow to Pytorch and losses doesn't decrease

I am trying to convert the Tensorflow object localization code into Pytorch to better understanding what is the difference between those two frameworks and deep dive into the provide preprocess Tensorflow code. In the original code model.compile and model.fit are used to train the model so I don’t understand how both the losses of classification of the MNIST digits and box regressions work. Still, I’m trying to implement my own training loop in Pytorch.

I set two losses : nn.CrossEntropyLoss and nn.MSELoss and I do (loss_1+loss_2).backward() to compute the gradients. I know it’s the right way to compute gradients with two losses from here and here.

But still, my loss doesn’t decrease whereas it collapses quasi-imediately as you know with the Tensorflow code. I checked the model with torchinfo.summary and it seems behaving as well as the Tensorflow implementation.

I looked for the predicted labels of my model and it doesn’t seem to change at all.
This line of code label_preds, bbox_coords_preds = model(digits) always returns the same values

label_preds[0] = tensor([[0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156]], device='cuda:0', grad_fn=<SliceBackward0>)

Here are my questions :

  • Is my custom network set correctly ?
  • Are my losses set correctly ?
  • Why my label predictions don’t change ?
  • Do my training loop work as well as the .compile and .fit Tensorflow methods ?

Thanks a lot !

PYTORCH CODE

class ConvNetwork(nn.Module):
    def __init__(self):
        super(ConvNetwork, self).__init__()
        self.conv2d_1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3)
        self.conv2d_2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3)
        self.conv2d_3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)
        self.avgPooling2D = nn.AvgPool2d((2,2))
        self.dense_1 = nn.Linear(in_features=3136, out_features=128)
        
        self.dense_classifier = nn.Linear(in_features=128, out_features=10)
        self.softmax = nn.Softmax(dim=0)
        self.dense_regression = nn.Linear(in_features=128, out_features=4)


    def forward(self, input):
        x = self.avgPooling2D(F.relu(self.conv2d_1(input)))
        x = self.avgPooling2D(F.relu(self.conv2d_2(x)))
        x = self.avgPooling2D(F.relu(self.conv2d_3(x)))
        x = nn.Flatten()(x)
        x = F.relu(self.dense_1(x))

        output_classifier = self.softmax(self.dense_classifier(x))
        output_regression = self.dense_regression(x)
        return [output_classifier, output_regression]

######################################################

learning_rate = 0.1
EPOCHS = 1
BATCH_SIZE = 64

model = resNet_custom() #ConvNetwork()
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
classification_loss = nn.CrossEntropyLoss()
regression_loss = nn.MSELoss()

######################################################

begin_time = time.time()
for epoch in range(EPOCHS) : 
    tot_loss = 0
    train_start = time.time()
    training_losses = []
    
    print("-"*20)
    print(" "*5 + f"EPOCH {epoch+1}/{EPOCHS}")
    print("-"*20)

    model.train()
    for batch, (digits, labels, bbox_coords) in enumerate(training_dataset):
        digits, labels, bbox_coords = digits.to(device), labels.to(device), bbox_coords.to(device)
        optimizer.zero_grad()
        
        [label_preds, bbox_coords_preds] = model(digits)
        
        class_loss = classification_loss(label_preds, labels)
        box_loss = regression_loss(bbox_coords_preds, bbox_coords)

        training_loss = class_loss + box_loss
        training_loss.backward()
        
        optimizer.step()
        
        ######### print part #######################
        training_losses.append(training_loss.item())
        if batch+1 <= len_training_ds//BATCH_SIZE:
            current_training_sample = (batch+1)*BATCH_SIZE
        else:
            current_training_sample = (batch)*BATCH_SIZE + len_training_ds%BATCH_SIZE
        
        if (batch+1) == 1 or (batch+1)%100 == 0 or (batch+1) == len_training_ds//BATCH_SIZE +1:
            print(f"Elapsed time : {(time.time()-train_start)/60:.3f}",\
                  f" --- Digit : {current_training_sample}/{len_training_ds}",\
                  f" : loss = {training_loss:.5f}")
            if batch+1 == (len_training_ds//BATCH_SIZE)+1:
                print(f"Total elapsed time for training : {(time.time()-begin_time)/60:.3f}")

ORIGINAL TENSORFLOW CODE

def feature_extractor(inputs):
    x = tf.keras.layers.Conv2D(16, activation='relu', kernel_size=3, input_shape=(75, 75, 1))(inputs)
    x = tf.keras.layers.AveragePooling2D((2, 2))(x)
    x = tf.keras.layers.Conv2D(32,kernel_size=3,activation='relu')(x)
    x = tf.keras.layers.AveragePooling2D((2, 2))(x)
    x = tf.keras.layers.Conv2D(64,kernel_size=3,activation='relu')(x)
    x = tf.keras.layers.AveragePooling2D((2, 2))(x)
    return x

def dense_layers(inputs):
  x = tf.keras.layers.Flatten()(inputs)
  x = tf.keras.layers.Dense(128, activation='relu')(x)
  return x

def classifier(inputs):

  classification_output = tf.keras.layers.Dense(10, activation='softmax', name = 'classification')(inputs)
  return classification_output


def bounding_box_regression(inputs):
    bounding_box_regression_output = tf.keras.layers.Dense(units = '4', name = 'bounding_box')(inputs)
    return bounding_box_regression_output


def final_model(inputs):
    feature_cnn = feature_extractor(inputs)
    dense_output = dense_layers(feature_cnn)

    classification_output = classifier(dense_output)
    bounding_box_output = bounding_box_regression(dense_output)

    model = tf.keras.Model(inputs = inputs, outputs = [classification_output,bounding_box_output])
    return model
  
def define_and_compile_model(inputs):
  model = final_model(inputs)
  model.compile(optimizer='adam', 
              loss = {'classification' : 'categorical_crossentropy',
                      'bounding_box' : 'mse'
                     },
              metrics = {'classification' : 'accuracy',
                         'bounding_box' : 'mse'
                        })
  return model

    

inputs = tf.keras.layers.Input(shape=(75, 75, 1,))
model = define_and_compile_model(inputs)


EPOCHS = 10 # 45
steps_per_epoch = 60000//BATCH_SIZE  # 60,000 items in this dataset
validation_steps = 1

history = model.fit(training_dataset,
                    steps_per_epoch=steps_per_epoch, 
                    validation_data=validation_dataset, 
                    validation_steps=validation_steps, epochs=EPOCHS)

loss, classification_loss, bounding_box_loss, classification_accuracy, bounding_box_mse = model.evaluate(validation_dataset, steps=1)
print("Validation accuracy: ", classification_accuracy)

One possible cause is the high learning rate. Set the learning rate to 0.001 and then train your model.