Tokenizer Error on batched=True When Using Different Cloud Service

def tokenize_function(example):
    """Tokenize a batch of dialogues and their summaries.

    Each entry of ``example["dialogue"]`` is wrapped in a fixed
    summarization prompt before being passed to the module-level
    ``tokenizer``; ``example["summary"]`` is tokenized as-is. The resulting
    token ids are written back onto ``example`` under ``input_ids`` and
    ``labels``, and the same ``example`` object is returned.

    Both columns are requested with ``return_tensors="pt"``, so they are
    PyTorch tensors rather than lists or NumPy arrays.
    """
    prefix = 'Summarize the following conversation.\n\n'
    suffix = '\n\nSummary: '

    # Build one full prompt per dialogue in the batch.
    prompts = []
    for dialogue in example["dialogue"]:
        prompts.append(prefix + dialogue + suffix)

    encoded_prompts = tokenizer(prompts, padding="max_length", truncation=True, return_tensors="pt")
    example['input_ids'] = encoded_prompts.input_ids

    encoded_summaries = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt")
    example['labels'] = encoded_summaries.input_ids

    return example

# The dataset contains three different splits: train, validation, and test.
# With batched=True, tokenize_function receives the data of every split in batches.
# NOTE(review): the call expression was missing from this line; `dataset` is
# assumed to be the loaded dataset object — confirm the variable name.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

TypeError: Provided function which is applied to all elements of table returns a dict of types [<class 'list'>,
<class 'list'>, <class 'list'>, <class 'list'>, <class 'torch.Tensor'>, <class 'torch.Tensor'>]. When using
batched=True, make sure provided function returns a dict of types like (<class 'list'>, <class 'numpy.ndarray'>).

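With batched=True, the mapped function must return lists or NumPy arrays, not torch tensors, so return NumPy arrays for the new columns. Use this instead: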

def tokenize_function(example):
    """Tokenize a batch of dialogues and summaries into NumPy arrays.

    Each entry of ``example["dialogue"]`` is wrapped in a fixed
    summarization prompt before being passed to the module-level
    ``tokenizer``; ``example["summary"]`` is tokenized as-is. The token ids
    are written back onto ``example`` under ``input_ids`` and ``labels``,
    and the same ``example`` object is returned.

    The tokenizer is asked for NumPy output directly
    (``return_tensors="np"``) instead of building PyTorch tensors and
    converting them with ``.numpy()``. The columns are still
    ``numpy.ndarray`` objects, which a batched ``map`` accepts, but no
    intermediate torch tensors are created.
    """
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]

    # padding="max_length" with truncation=True gives every row the same
    # length, so the ids form a rectangular array.
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="np").input_ids
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="np").input_ids
    return example