hi
I trained the model using the PyTorch framework and the COCO dataset.
When I test my model I get very low-quality output — could you please guide me on how to resolve this problem?
import torch
import torch.nn as nn
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data.dataloader import DataLoader
# Define the TextToImageGAN model
class TextToImageGAN(nn.Module):
    """Generate an image tensor from a batch of token-id captions.

    NOTE(review): despite the name this is not a GAN — there is no
    discriminator; it is a single feed-forward generator.

    Args:
        vocab_size: number of entries in the token embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: channel count of the intermediate feature map.
        image_size: height/width of the generated (square) image.
        channels: output image channels (e.g. 3 for RGB).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, image_size, channels):
        super(TextToImageGAN, self).__init__()
        # Bug fix: keep the dims on the instance. The original forward()
        # read the module-level globals `hidden_dim` / `image_size`,
        # silently ignoring the constructor arguments.
        self.hidden_dim = hidden_dim
        self.image_size = image_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, hidden_dim * image_size * image_size)
        self.conv = nn.Conv2d(hidden_dim, channels, kernel_size=3, stride=1, padding=1)

    def forward(self, captions):
        """Map (batch, seq_len) token ids to (batch, channels, H, W) images."""
        # Bag-of-words caption encoding: sum the token embeddings.
        x = self.embedding(captions).sum(dim=1)
        x = self.fc(x)
        x = x.view(-1, self.hidden_dim, self.image_size, self.image_size)
        x = torch.relu(x)
        x = self.conv(x)
        return x
# Toy vocabulary used for the demo; index 0 doubles as the unknown token.
vocab = {'<unk>': 0, 'the': 1, 'a': 2, 'on': 3, 'with': 4, 'and': 5, 'of': 6, 'in': 7}
vocab_size = len(vocab)


def text_transform(caption):
    """Encode a caption string as a 1-D LongTensor of vocabulary ids.

    Tokens are split on single spaces and lower-cased; any token not in
    ``vocab`` maps to the ``<unk>`` id.
    """
    unk = vocab['<unk>']
    ids = []
    for token in caption.split(' '):
        ids.append(vocab.get(token.lower(), unk))
    return torch.tensor(ids, dtype=torch.long)
# COCO captions dataset: resize images to 64x64, convert to tensors, and
# scale pixel values into [-1, 1] (matches the Normalize mean/std of 0.5).
train_images_dir = '/root/var/data/coco/train2017'  # update this path for your machine
annotations_file = '/root/var/data/coco/annotations/captions_train2017.json'  # update this path for your machine

_image_transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

coco_dataset = datasets.CocoCaptions(
    root=train_images_dir,
    annFile=annotations_file,
    transform=_image_transform,
)
# Custom function for batching
def collate_fn(batch):
    """Collate (image, captions) samples from CocoCaptions into a batch.

    Each COCO sample carries several reference captions; only the first
    one is used. Captions are tokenized with ``text_transform`` and
    right-padded with 0 to the longest caption in the batch.

    NOTE(review): the pad id 0 is also the ``<unk>`` id, so the model
    cannot distinguish padding from unknown words — consider a dedicated
    ``<pad>`` token.

    Returns:
        (padded_captions, images): a (batch, max_len) LongTensor and a
        stacked (batch, C, H, W) image tensor.
    """
    images, caption_lists = zip(*batch)
    # Keep only the first reference caption per image.
    encoded = [text_transform(caps[0]) for caps in caption_lists]
    # pad_sequence replaces the manual cat/zeros padding loop; it pads on
    # the right with 0, exactly as the original code did.
    padded_captions = nn.utils.rnn.pad_sequence(encoded, batch_first=True)
    images = torch.stack(images)
    return padded_captions, images
# DataLoader configuration.
BATCH_SIZE = 32
NUM_WORKERS = 8

data_loader = DataLoader(
    coco_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=NUM_WORKERS,
)
# Model, loss function, optimizer, and training loop setup
hidden_dim = 512
embedding_dim = 256
image_size = 64
channels = 3

# Bug fix: the original hard-coded vocab_size=10000 here, even though the
# vocabulary built above has only len(vocab) entries. Keep the embedding
# table consistent with the tokenizer.
model = TextToImageGAN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    image_size=image_size,
    channels=channels,
)

# NOTE(review): MSE against the target photo trains a per-pixel regressor,
# not a GAN. Combined with the tiny 8-word bag-of-words text encoder, the
# model collapses to blurry caption-averaged images — which matches the
# reported low-quality output. Sharp samples need an adversarial loss
# (generator + discriminator) or a diffusion objective, plus a real
# caption vocabulary/encoder.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
for epoch in range(num_epochs):
    for captions, images in data_loader:
        optimizer.zero_grad()
        generated_images = model(captions)
        loss = criterion(generated_images, images)
        loss.backward()
        optimizer.step()
    # Reports only the last batch's loss; an epoch average would be more informative.
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

torch.save(model.state_dict(), "text_to_image_gan_final.pth")
```

[quote="Najmul, post:1, topic:615865, full:true"]
hi
I trained the model using the PyTorch framework and the COCO dataset.
When I test my model I get very low-quality output — could you please guide me on how to resolve this problem?
"
import torch
import torch.nn as nn
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data.dataloader import DataLoader
# Define the TextToImageGAN model
class TextToImageGAN(nn.Module):
def __init__(self, vocab_size, embedding_dim, hidden_dim, image_size, channels):
super(TextToImageGAN, self).__init__()
self.embedding = nn.Embedding(vocab_size, embedding_dim)
self.fc = nn.Linear(embedding_dim, hidden_dim * image_size * image_size)
self.conv = nn.Conv2d(hidden_dim, channels, kernel_size=3, stride=1, padding=1)
def forward(self, captions):
x = self.embedding(captions).sum(dim=1)
x = self.fc(x)
x = x.view(-1, hidden_dim, image_size, image_size)
x = torch.relu(x)
x = self.conv(x)
return x
# Simple vocabulary mapping for demonstration purposes
vocab = {'<unk>': 0, 'the': 1, 'a': 2, 'on': 3, 'with': 4, 'and': 5, 'of': 6, 'in': 7}
vocab_size = len(vocab)
# Function to transform text captions into tensors
def text_transform(caption):
return torch.tensor([vocab.get(token.lower(), vocab['<unk>']) for token in caption.split(' ')], dtype=torch.long)
# COCO dataset loading with specified transformations
train_images_dir = '/root/var/data/coco/train2017' # Update this path
annotations_file = '/root/var/data/coco/annotations/captions_train2017.json' # Update this path
coco_dataset = datasets.CocoCaptions(
root=train_images_dir,
annFile=annotations_file,
transform=transforms.Compose([
transforms.Resize((64, 64)),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
)
# Custom function for batching
def collate_fn(batch):
images, caption_lists = zip(*batch)
captions = [caps[0] for caps in caption_lists]
processed_captions = [text_transform(caption) for caption in captions]
max_length = max(len(caption) for caption in processed_captions)
padded_captions = torch.stack([torch.cat([caption, torch.zeros(max_length - len(caption), dtype=torch.long)]) for caption in processed_captions])
images = torch.stack(images)
return padded_captions, images
# DataLoader setup
NUM_WORKERS = 8
BATCH_SIZE = 32
data_loader = DataLoader(coco_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, num_workers=NUM_WORKERS)
# Model, loss function, optimizer, and training loop setup
hidden_dim = 512
embedding_dim = 256
image_size = 64
channels = 3
model = TextToImageGAN(vocab_size=10000, embedding_dim=embedding_dim, hidden_dim=hidden_dim, image_size=image_size, channels=channels)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10
for epoch in range(num_epochs):
for captions, images in data_loader:
optimizer.zero_grad()
generated_images = model(captions)
loss = criterion(generated_images, images)
loss.backward()
optimizer.step()
print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
torch.save(model.state_dict(), "text_to_image_gan_final.pth")
"
# my output image
![frame_0|64x64](upload://aOkfZWVQsRHeIH2dLpCEID1HHnj.png)
[/quote]
# my output image
![frame_0|64x64](upload://aOkfZWVQsRHeIH2dLpCEID1HHnj.png)