Trained model produces very low-quality test output

Hi,

I trained the model below using the PyTorch framework and the COCO captions dataset. When I test the model, the generated images are very low quality. Could you please guide me on how to resolve this problem?

```python
import torch
import torch.nn as nn
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

# Define the TextToImageGAN model
# (note: despite the name, this is a single feed-forward generator trained
# with a pixel-wise loss; there is no discriminator or adversarial loss)
class TextToImageGAN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, image_size, channels):
        super().__init__()
        # Store the sizes on the module so forward() does not depend on globals
        self.hidden_dim = hidden_dim
        self.image_size = image_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, hidden_dim * image_size * image_size)
        self.conv = nn.Conv2d(hidden_dim, channels, kernel_size=3, stride=1, padding=1)

    def forward(self, captions):
        # Sum the word embeddings into a single caption vector: (batch, embedding_dim)
        x = self.embedding(captions).sum(dim=1)
        x = self.fc(x)
        # Reshape into a (batch, hidden_dim, image_size, image_size) feature map
        x = x.view(-1, self.hidden_dim, self.image_size, self.image_size)
        x = torch.relu(x)
        x = self.conv(x)
        return x

# Simple vocabulary mapping for demonstration purposes
vocab = {'<unk>': 0, 'the': 1, 'a': 2, 'on': 3, 'with': 4, 'and': 5, 'of': 6, 'in': 7}
vocab_size = len(vocab)

# Function to transform text captions into tensors
def text_transform(caption):
    return torch.tensor([vocab.get(token.lower(), vocab['<unk>']) for token in caption.split(' ')], dtype=torch.long)
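
# Example (with the toy vocab above): out-of-vocabulary words fall back to <unk>, e.g.
#   text_transform("a cat on the mat")  ->  tensor([2, 0, 3, 1, 0])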

# COCO dataset loading with specified transformations
train_images_dir = '/root/var/data/coco/train2017'  # Update this path
annotations_file = '/root/var/data/coco/annotations/captions_train2017.json'  # Update this path

coco_dataset = datasets.CocoCaptions(
    root=train_images_dir,
    annFile=annotations_file,
    transform=transforms.Compose([
        transforms.Resize((64, 64)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
)
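
# CocoCaptions yields (image, captions) pairs, where captions is a list of
# several annotator strings per image; collate_fn below keeps only the first one.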

# Custom function for batching
def collate_fn(batch):
    images, caption_lists = zip(*batch)
    captions = [caps[0] for caps in caption_lists]
    processed_captions = [text_transform(caption) for caption in captions]
    max_length = max(len(caption) for caption in processed_captions)
    padded_captions = torch.stack([torch.cat([caption, torch.zeros(max_length - len(caption), dtype=torch.long)]) for caption in processed_captions])
    images = torch.stack(images)
    return padded_captions, images
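
# Note: the manual padding above should be equivalent to PyTorch's built-in helper
# (a sketch, padding with index 0, which here doubles as the <unk> token):
#   from torch.nn.utils.rnn import pad_sequence
#   padded_captions = pad_sequence(processed_captions, batch_first=True, padding_value=0)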

# DataLoader setup
NUM_WORKERS = 8
BATCH_SIZE = 32
data_loader = DataLoader(coco_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, num_workers=NUM_WORKERS)
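
# Quick shape check (expected per batch, given the transforms above):
#   padded_captions: LongTensor of shape (BATCH_SIZE, max_caption_length)
#   images:          FloatTensor of shape (BATCH_SIZE, 3, 64, 64), roughly in [-1, 1]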

# Model, loss function, optimizer, and training loop setup
hidden_dim = 512
embedding_dim = 256
image_size = 64
channels = 3
model = TextToImageGAN(vocab_size=vocab_size, embedding_dim=embedding_dim, hidden_dim=hidden_dim, image_size=image_size, channels=channels)  # use the real vocab size, not a hard-coded 10000
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
for epoch in range(num_epochs):
    for captions, images in data_loader:
        optimizer.zero_grad()
        generated_images = model(captions)
        loss = criterion(generated_images, images)
        loss.backward()
        optimizer.step()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

torch.save(model.state_dict(), "text_to_image_gan_final.pth")
```
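
For reference, this is roughly how I generate the test image (a minimal sketch; it assumes the model, vocab, and text_transform defined above, and it undoes the Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) transform before saving):

```python
# Minimal inference sketch (assumes the definitions above; the caption is a made-up example)
from torchvision.utils import save_image

model.load_state_dict(torch.load("text_to_image_gan_final.pth"))
model.eval()

caption = "a dog in the grass"  # hypothetical test caption
tokens = text_transform(caption).unsqueeze(0)  # add a batch dimension: (1, seq_len)

with torch.no_grad():
    generated = model(tokens)  # (1, 3, 64, 64)

# Map from the normalized [-1, 1] training range back to [0, 1] before saving
image = (generated.squeeze(0) * 0.5 + 0.5).clamp(0, 1)
save_image(image, "frame_0.png")
```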


# My output image
![frame_0|64x64](upload://aOkfZWVQsRHeIH2dLpCEID1HHnj.png)

If you’re going to post your code, please use the “preformatted text” tag. This will make your code appear like code and not as Markdown.
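
For example, wrap the snippet in triple backticks:

    ```python
    # your code here
    ```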


Thank you!
