Implementing logistic regression using PyTorch does not produce the same results

import numpy as np
import copy
import matplotlib.pyplot as plt
import h5py
import scipy
from PIL import Image
from scipy import ndimage
from lr_utils import load_dataset
from public_tests import *

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
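# Note: one common reason results differ between runs (and from a NumPy
# reference implementation) is random weight initialization. Seeding makes
# runs repeatable; a minimal sketch, assuming no other sources of randomness:
torch.manual_seed(0)
np.random.seed(0)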

num_epochs = 5000

# Loading the data (cat/non-cat)
train_set_x_orig, train_set_y, test_set_x_orig, test_set_y, classes = load_dataset()

# Flatten X for train and test
train_set_x_flatten = train_set_x_orig.reshape(train_set_x_orig.shape[0], -1)  # PyTorch expects (batch_size, dim)
test_set_x_flatten = test_set_x_orig.reshape(test_set_x_orig.shape[0], -1)  # PyTorch expects (batch_size, dim)
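# For reference: the reshape flattens each image into one row, e.g.
# (209, 64, 64, 3) becomes (209, 12288), one example per row, which is the
# layout nn.Linear expects (the exact sizes are printed a few lines down).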

# load_dataset returns Y with shape (1, m). The model output below will be
# (m, 1), and BCELoss expects input and target of the same shape, so
# transpose Y to match.
train_set_y = train_set_y.T
test_set_y = test_set_y.T

# Inspect the dataset shapes
print(f"Train set X shape: {train_set_x_flatten.shape}")
print(f"Train set Y shape: {train_set_y.shape}")
print(f"Test set X shape: {test_set_x_flatten.shape}")
print(f"Test set Y shape: {test_set_y.shape}")

# Standardize dataset
train_set_x = train_set_x_flatten / 255.
test_set_x = test_set_x_flatten / 255.

class NN(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        # Logistic regression: a single linear layer followed by a sigmoid
        self.fc1 = nn.Linear(input_size, 1, bias=True)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.sigmoid(x)  # F.sigmoid is deprecated
        return x
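# Alternative sketch, not in the original post: for numerical stability it is
# common to return raw logits from forward() and use nn.BCEWithLogitsLoss,
# which fuses the sigmoid into the loss. NNLogits is a hypothetical variant.
class NNLogits(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 1, bias=True)

    def forward(self, x):
        return self.fc1(x)  # raw logits; pair with nn.BCEWithLogitsLoss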


# set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Initialize network
model = NN(input_size = train_set_x.shape[1]).to(device)
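# If the NumPy reference initializes w and b to zeros (as the classic logistic
# regression exercise does), note that nn.Linear uses a random Kaiming-uniform
# init by default, so the two trajectories differ from the first step. A
# hedged sketch to match zero initialization:
nn.init.zeros_(model.fc1.weight)
nn.init.zeros_(model.fc1.bias)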

# Sanity-check the output shape with a random batch
test = np.random.randn(train_set_x.shape[0], train_set_x.shape[1])  # (batch_size, dim)
test = torch.tensor(test, dtype=torch.float32).to(device)
assert test.dtype == torch.float32, "Linear layer is float32, convert the input to float32 as well"
result = model(test)
np_result = result.detach().cpu().numpy()
print(np_result.shape)  # (batch_size, 1), e.g. (209, 1)
assert np_result.shape == train_set_y.shape, "Output dimensions are not correct"

# Hyper-parameters
learning_rate = 0.001

# Setting up options
criterion = torch.nn.BCELoss()  # Binary Cross-Entropy loss
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
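# If the goal is to match a plain gradient-descent NumPy implementation, Adam
# follows a different optimization path even from identical initial weights.
# A closer match would be vanilla SGD; 0.005 is the learning rate the classic
# exercise uses, but adjust to whatever the reference actually used:
# optimizer = optim.SGD(model.parameters(), lr=0.005)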


# Accuracy helper
def check_accuracy(X, Y):
    with torch.no_grad():
        X = torch.tensor(X, dtype=torch.float32).to(device)
        Y = torch.tensor(Y, dtype=torch.float32).to(device)

        # Threshold the sigmoid probabilities at 0.5 to get hard 0/1 labels;
        # comparing raw probabilities to the labels would almost never match
        # exactly, so the accuracy would come out near zero.
        Y_pred = (model(X) > 0.5).float()
        accuracy = (Y_pred == Y).float().mean().item()
    return accuracy


# Using batch gradient descent, i.e. all the data is used in every epoch.
# Convert to tensors once, outside the loop; re-wrapping a tensor with
# torch.tensor() every epoch is wasteful and raises a warning.
X_train = torch.tensor(train_set_x, dtype=torch.float32).to(device)
Y_train = torch.tensor(train_set_y, dtype=torch.float32).to(device)

for epoch in range(num_epochs):
    prediction = model(X_train)
    loss = criterion(prediction, Y_train)

    # Reset gradients first; without zero_grad() they accumulate across
    # epochs, which is a common reason results diverge from a NumPy version.
    optimizer.zero_grad()
    loss.backward()

    # Gradient step with Adam
    optimizer.step()

    if (epoch + 1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

accuracy_train = check_accuracy(train_set_x, train_set_y)
accuracy_test = check_accuracy(test_set_x, test_set_y)

print(f" Final Train Accuracy : {accuracy_train:.2f}, Final Test Accuracy : {accuracy_test:.2f}")