Poor Performance in Training the Network with Images

Hello everyone,

I’ve encountered difficulties during the initial training of my neural network with images, and the results were very disappointing. I’m seeking your advice and suggestions to improve the performance of my model in this particular phase of learning.

Below, you’ll find the relevant code. Any assistance would be greatly appreciated. Thank you in advance for your contributions.

import math
import torch
import logging
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

class GRUModel(nn.Module):
    """GRU encoder that returns the output of every time step.

    Args:
        input_dim: Size of each input feature vector.
        hidden_size: Size of the GRU hidden state.
        n_layers: Number of stacked GRU layers.
    """

    def __init__(self, input_dim=96, hidden_size=128, n_layers=1):
        super(GRUModel, self).__init__()
        # batch_first=True: batched inputs are laid out (batch, seq, feature).
        self.gru = nn.GRU(input_dim, hidden_size, n_layers, batch_first=True)

    def forward(self, x):
        """Run the GRU over ``x`` and return the per-step outputs.

        The final hidden state returned by ``nn.GRU`` is discarded.
        """
        out, _ = self.gru(x)
        return out

    def init_weights(self):
        """Re-initialise the GRU parameters in place.

        Biases are set to zero, input-to-hidden weights use Xavier-uniform
        and hidden-to-hidden weights use an orthogonal initialisation.
        This method is not invoked by ``__init__``; call it explicitly.
        """
        for name, param in self.gru.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight_ih' in name:
                nn.init.xavier_uniform_(param)
            elif 'weight_hh' in name:
                # Orthogonal recurrent weights keep the norm of the hidden
                # state stable when it is propagated through time.
                nn.init.orthogonal_(param)

class SpatialAttention(nn.Module):
    """Additive attention that pools along dimension 1 of its input.

    A single learned vector ``W`` of shape ``(input_dim, 1)`` scores every
    position; the scores are normalised with a softmax over dimension 1 and
    used to form a weighted sum of the input along that dimension.

    Args:
        input_dim: Size of the last dimension of the input.
    """

    def __init__(self, input_dim):
        super(SpatialAttention, self).__init__()
        self.W = nn.Parameter(torch.empty(input_dim, 1))
        self.softmax = nn.Softmax(dim=1)
        # W is stored as (input_dim, 1), so PyTorch's fan-in (size of dim 1)
        # is 1 and the default fan-in mode would give W a standard deviation
        # independent of input_dim, large enough to saturate the softmax for
        # wide inputs. mode='fan_out' scales by input_dim (size of dim 0).
        torch.nn.init.kaiming_normal_(self.W, a=math.sqrt(5), mode='fan_out')

    def forward(self, x):
        """Return the attention-weighted sum of ``x`` over dimension 1.

        ``x`` must have ``input_dim`` as its last dimension.
        Assumes ``x`` is (batch, steps, input_dim) - TODO confirm with caller.
        """
        scores = torch.matmul(torch.tanh(x), self.W)
        alpha = self.softmax(scores)
        # alpha has a trailing size-1 dimension and broadcasts over features.
        context = torch.sum(torch.mul(x, alpha), dim=1)
        return context

class Dense_Layer(nn.Module):
    """Two-class classification head: linear projection followed by softmax.

    Args:
        in_chans: Number of input features.
        dropout_p: Accepted for interface compatibility; no dropout layer is
            created, so this value is currently unused.
    """

    def __init__(self, in_chans=96, dropout_p=0.25):
        super(Dense_Layer, self).__init__()
        self.fc_out = nn.Linear(in_chans, 2)
        self.softmax = nn.Softmax(dim=1)

    def init_weights(self):
        """Re-initialise the linear layer: Xavier-uniform weight, zero bias.

        This method is not invoked by ``__init__``; call it explicitly.
        """
        nn.init.xavier_uniform_(self.fc_out.weight)
        nn.init.zeros_(self.fc_out.bias)

    def forward(self, x):
        """Return class probabilities; the softmax is taken over dimension 1.

        NOTE(review): the output is already normalised. If the training loss
        is ``nn.CrossEntropyLoss`` (which expects raw logits), the softmax
        would effectively be applied twice - confirm against the training
        loop, which is not visible here.
        """
        x = self.fc_out(x)
        x = self.softmax(x)
        return x

class VisionEmbedding(nn.Module):
    """Encode image features into a single 256-dimensional vector.

    Pipeline: ``Linear(2048, 512) + ReLU`` -> ``GRUModel`` (hidden size 256)
    -> ``SpatialAttention`` pooling over dimension 1.

    Args:
        configs: Accepted for interface compatibility; not read here.
    """

    def __init__(self, configs):
        super(VisionEmbedding, self).__init__()
        self.fc = nn.Sequential(nn.Linear(2048, 512), nn.ReLU())
        self.hm = GRUModel(input_dim=512, hidden_size=256)
        self.self_attention = SpatialAttention(input_dim=256)

    def forward(self, x):
        """Embed ``x['local_image']``.

        Assumes the tensor is (batch, steps, ..., 2048) with only size-1
        extra dimensions - TODO confirm against the data loader.
        """
        frames = x['local_image']
        feats = frames.squeeze()
        if feats.dim() != 3:
            # A bare squeeze() also drops the batch (or step) axis when it
            # has size 1, so the GRU and the attention would then operate on
            # the wrong axes. Rebuild (batch, steps, features) explicitly,
            # keeping dimension 0 as the batch axis.
            feats = frames.reshape(frames.shape[0], -1, self.fc[0].in_features)
        out = self.fc(feats)
        out = self.hm(out)
        out = self.self_attention(out)
        return out

class IntentPrediction(nn.Module):
    """Full model: a ``VisionEmbedding`` encoder feeding a ``Dense_Layer`` head.

    Args:
        configs: Passed through unchanged to ``VisionEmbedding``.
    """

    def __init__(self, configs= None):
        super().__init__()
        self.vision_encoder = VisionEmbedding(configs)
        self.dense_layer = Dense_Layer(in_chans=256)

    def forward(self, x):
        """Encode ``x`` with the vision encoder and return the head's output."""
        embedding = self.vision_encoder(x)
        return self.dense_layer(embedding)

Can I ask why you have a GRU in the middle of this model?

I included the GRU layer to capture temporal dependencies within the sequence of video frames, enhancing the model’s ability to understand the temporal dynamics present in the data.

1 Like