def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    return example
# The dataset contains three different splits: train, validation, and test.
# tokenize_function is applied to the data in every split, in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
TypeError: Provided function which is applied to all elements of table returns a dict of types [<class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'torch.Tensor'>, <class 'torch.Tensor'>]. When using batched=True, make sure provided function returns a dict of types like (<class 'list'>, <class 'numpy.ndarray'>).
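# One possible fix, sketched on the assumption that PyTorch tensors are not
# needed until a data collator or the Trainer converts the columns later:
# drop return_tensors="pt" so the tokenizer returns plain Python lists of ids,
# which Dataset.map accepts from a batched function. `tokenizer` and `dataset`
# are assumed to be the same objects defined earlier in the notebook.
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    # Without return_tensors="pt", input_ids is a list of lists of ints,
    # matching the (<class 'list'>, <class 'numpy.ndarray'>) types the error asks for.
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True).input_ids
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True).input_ids
    return example

tokenized_datasets = dataset.map(tokenize_function, batched=True)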