Hello everyone,
In the Week 3 lab, I get this error:
code:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, GenerationConfig
from datasets import load_dataset
from peft import PeftModel, PeftConfig, LoraConfig, TaskType
# trl: Transformer Reinforcement Learning library
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead
from trl import create_reference_model
from trl.core import LengthSampler
import torch
import evaluate
import numpy as np
import pandas as pd
# tqdm library makes the loops show a smart progress meter.
from tqdm import tqdm
tqdm.pandas()
error message:
2025-02-21 15:12:56.585248: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-02-21 15:12:57.362821: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
another error:
code:
def build_dataset(model_name: str,
                  dataset_name: str,
                  input_min_text_length: int,
                  input_max_text_length: int):
    """
    Preprocess the dataset and split it into train and test parts.

    Parameters:
    - model_name (str): Tokenizer model name.
    - dataset_name (str): Name of the dataset to load.
    - input_min_text_length (int): Minimum length of the dialogues.
    - input_max_text_length (int): Maximum length of the dialogues.

    Returns:
    - dataset_splits (datasets.dataset_dict.DatasetDict): Preprocessed dataset containing train and test parts.
    """
    # Load dataset (only "train" part will be enough for this lab).
    dataset = load_dataset(dataset_name, split="train")

    # Keep only dialogues whose character length is strictly greater than
    # input_min_text_length and at most input_max_text_length.
    dataset = dataset.filter(
        lambda x: input_min_text_length < len(x["dialogue"]) <= input_max_text_length,
        batched=False,
    )

    # Prepare tokenizer.
    # NOTE(review): device_map="auto" is passed through unchanged from the
    # original code; confirm the tokenizer actually needs it.
    tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")

    def tokenize(sample):
        """Add the tokenized prompt ("input_ids") and its decoded text ("query") to the sample."""
        # Wrap each dialogue with the instruction.
        # All quotes here must be plain ASCII quotes: typographic quotes
        # (as produced by some editors/forums on copy-paste) are a SyntaxError.
        # NOTE(review): the template lines are flush-left on purpose, since
        # they are part of the string; blank lines inside the template may
        # have been lost in the paste — confirm against the lab notebook.
        prompt = f"""
Summarize the following conversation.
{sample["dialogue"]}
Summary:
"""
        sample["input_ids"] = tokenizer.encode(prompt)

        # This must be called "query", which is a requirement of our PPO library.
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    # Tokenize each dialogue.
    dataset = dataset.map(tokenize, batched=False)
    dataset.set_format(type="torch")

    # Split the dataset into train and test parts (no shuffling, so the
    # split is the same on every run).
    dataset_splits = dataset.train_test_split(test_size=0.2, shuffle=False, seed=42)

    return dataset_splits
# Build the train/test splits from dialogues longer than 200 and at most
# 1000 characters, then show the resulting splits.
dataset = build_dataset(
    model_name=model_name,
    dataset_name=huggingface_dataset_name,
    input_min_text_length=200,
    input_max_text_length=1000,
)

print(dataset)
error message:
def build_dataset(model_name: str,
                  dataset_name: str,
                  input_min_text_length: int,
                  input_max_text_length: int):
    """
    Preprocess the dataset and split it into train and test parts.

    Parameters:
    - model_name (str): Tokenizer model name.
    - dataset_name (str): Name of the dataset to load.
    - input_min_text_length (int): Minimum length of the dialogues.
    - input_max_text_length (int): Maximum length of the dialogues.

    Returns:
    - dataset_splits (datasets.dataset_dict.DatasetDict): Preprocessed dataset containing train and test parts.
    """
    # Load dataset (only "train" part will be enough for this lab).
    dataset = load_dataset(dataset_name, split="train")

    # Keep only dialogues whose character length is strictly greater than
    # input_min_text_length and at most input_max_text_length.
    dataset = dataset.filter(
        lambda x: input_min_text_length < len(x["dialogue"]) <= input_max_text_length,
        batched=False,
    )

    # Prepare tokenizer.
    # NOTE(review): device_map="auto" is passed through unchanged from the
    # original code; confirm the tokenizer actually needs it.
    tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")

    def tokenize(sample):
        """Add the tokenized prompt ("input_ids") and its decoded text ("query") to the sample."""
        # Wrap each dialogue with the instruction.
        # All quotes here must be plain ASCII quotes: typographic quotes
        # (as produced by some editors/forums on copy-paste) are a SyntaxError.
        # NOTE(review): the template lines are flush-left on purpose, since
        # they are part of the string; blank lines inside the template may
        # have been lost in the paste — confirm against the lab notebook.
        prompt = f"""
Summarize the following conversation.
{sample["dialogue"]}
Summary:
"""
        sample["input_ids"] = tokenizer.encode(prompt)

        # This must be called "query", which is a requirement of our PPO library.
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    # Tokenize each dialogue.
    dataset = dataset.map(tokenize, batched=False)
    dataset.set_format(type="torch")

    # Split the dataset into train and test parts (no shuffling, so the
    # split is the same on every run).
    dataset_splits = dataset.train_test_split(test_size=0.2, shuffle=False, seed=42)

    return dataset_splits
# Build the train/test splits from dialogues longer than 200 and at most
# 1000 characters, then show the resulting splits.
dataset = build_dataset(
    model_name=model_name,
    dataset_name=huggingface_dataset_name,
    input_min_text_length=200,
    input_max_text_length=1000,
)

print(dataset)
I did not change the code, and after reopening the file the error still exists. Similar errors occur in the following lines, so I guess that once the first errors are solved, the errors in the following lines will disappear. What should I do? Thank you!