In the course GenAI with LLM, I am trying to run Lab 2 using my own AWS resources. I am using an ml.g4dn.8xlarge instance with a GPU. However, when trying to run the notebook I am getting various errors related to the data and the model not being on the same device. Does anyone have an example of working code to run the lab with a GPU / CUDA? Thanks
To add a little more detail: everything works if I move all the models to the CPU with model = model.to('cpu'). However, if the models are on the cuda device, I'm not sure of the best way to also move the data to the GPU. The data is provided by a DatasetDict class and passed into a Trainer class. I'm not sure of the best way to tell the Trainer to move the data to CUDA for each batch.
Here is the exact error I am getting:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_10739/4239511286.py in <cell line: 1>()
----> 1 peft_trainer.train()
~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1622 hf_hub_utils.enable_progress_bars()
1623 else:
-> 1624 return inner_training_loop(
1625 args=args,
1626 resume_from_checkpoint=resume_from_checkpoint,
~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/accelerate/utils/memory.py in decorator(*args, **kwargs)
136 raise RuntimeError("No executable batch size found, reached zero.")
137 try:
--> 138 return function(batch_size, *args, **kwargs)
139 except Exception as e:
140 if should_reduce_batch_size(e):
~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1926
1927 step = -1
-> 1928 for step, inputs in enumerate(epoch_iterator):
1929 total_batched_samples += 1
1930
~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/accelerate/data_loader.py in __iter__(self)
450 # We iterate one batch ahead to check when we are at the end
451 try:
--> 452 current_batch = next(dataloader_iter)
453 except StopIteration:
454 yield
~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/torch/utils/data/dataloader.py in __next__(self)
632 # TODO(https://github.com/pytorch/pytorch/issues/76750)
633 self._reset() # type: ignore[call-arg]
--> 634 data = self._next_data()
635 self._num_yielded += 1
636 if self._dataset_kind == _DatasetKind.Iterable and \
~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/torch/utils/data/dataloader.py in _next_data(self)
675
676 def _next_data(self):
--> 677 index = self._next_index() # may raise StopIteration
678 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
679 if self._pin_memory:
~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/torch/utils/data/dataloader.py in _next_index(self)
622
623 def _next_index(self):
--> 624 return next(self._sampler_iter) # may raise StopIteration
625
626 def _next_data(self):
~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/torch/utils/data/sampler.py in __iter__(self)
252 batch = [0] * self.batch_size
253 idx_in_batch = 0
--> 254 for idx in self.sampler:
255 batch[idx_in_batch] = idx
256 idx_in_batch += 1
~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/accelerate/data_loader.py in __iter__(self)
90 # print("Setting seed at epoch", self.epoch, seed)
91 self.generator.manual_seed(seed)
---> 92 yield from super().__iter__()
93 self.set_epoch(self.epoch + 1)
94
~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/torch/utils/data/sampler.py in __iter__(self)
130 else:
131 for _ in range(self.num_samples // n):
--> 132 yield from torch.randperm(n, generator=generator).tolist()
133 yield from torch.randperm(n, generator=generator).tolist()[:self.num_samples % n]
134
~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/torch/utils/_device.py in __torch_function__(self, func, types, args, kwargs)
60 if func in _device_constructors() and kwargs.get('device') is None:
61 kwargs['device'] = self.device
---> 62 return func(*args, **kwargs)
63
64 # NB: This is directly called from C++ in torch/csrc/Device.cpp
RuntimeError: Expected a 'cuda' device type for generator but found 'cpu'
You should be able to use the .to() method on tensors, just as you did with the model:
myTensor = myTensor.to(device)
Disclaimer: I'm not a mentor for this course and have not looked at any of its assignments, so I won't be able to answer more detailed questions that involve the actual specifics of the assignment.