Week 3 lab trouble

Hello everyone,
in the Week 3 lab, I have this error:
code:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, GenerationConfig
from datasets import load_dataset
from peft import PeftModel, PeftConfig, LoraConfig, TaskType

# trl: Transformer Reinforcement Learning library
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead
from trl import create_reference_model
from trl.core import LengthSampler

import torch
import evaluate

import numpy as np
import pandas as pd

# The tqdm library makes loops show a smart progress meter.
from tqdm import tqdm
tqdm.pandas()
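
(For context: tqdm.pandas() patches pandas so that progress_apply() behaves like apply() but draws a progress bar. A tiny illustration with a made-up two-row DataFrame:)

import pandas as pd
from tqdm import tqdm

tqdm.pandas()

df = pd.DataFrame({"dialogue": ["hi", "hello there"]})
# progress_apply works like apply, but shows a tqdm progress meter.
df["n_chars"] = df["dialogue"].progress_apply(len)
print(df)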

error message:
2025-02-21 15:12:56.585248: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-02-21 15:12:57.362821: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
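
(Note: these two lines are an informational TensorFlow log message and a TF-TRT warning, not Python errors, so the notebook can usually run on despite them. If you only want to silence them, a minimal sketch, assuming the variable is set before TensorFlow is first imported:)

import os

# Hide TensorFlow INFO/WARNING logs; this must run before TensorFlow is
# imported anywhere in the process.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"  # 0 = all, 1 = hide INFO, 2 = hide INFO+WARNING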

another error:
code:
def build_dataset(model_name,
                  dataset_name,
                  input_min_text_length,
                  input_max_text_length):

    """
    Preprocess the dataset and split it into train and test parts.

    Parameters:
    - model_name (str): Tokenizer model name.
    - dataset_name (str): Name of the dataset to load.
    - input_min_text_length (int): Minimum length of the dialogues.
    - input_max_text_length (int): Maximum length of the dialogues.

    Returns:
    - dataset_splits (datasets.dataset_dict.DatasetDict): Preprocessed dataset containing train and test parts.
    """

    # Load the dataset (only the "train" split is needed for this lab).
    dataset = load_dataset(dataset_name, split="train")

    # Keep only dialogues between input_min_text_length and input_max_text_length characters.
    dataset = dataset.filter(lambda x: len(x["dialogue"]) > input_min_text_length and len(x["dialogue"]) <= input_max_text_length, batched=False)

    # Prepare the tokenizer. Setting device_map="auto" allows switching between GPU and CPU automatically.
    tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")

    def tokenize(sample):

        # Wrap each dialogue with the instruction.
        prompt = f"""
Summarize the following conversation.

{sample["dialogue"]}

Summary:
"""
        sample["input_ids"] = tokenizer.encode(prompt)

        # This must be called "query", which is a requirement of our PPO library.
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    # Tokenize each dialogue.
    dataset = dataset.map(tokenize, batched=False)
    dataset.set_format(type="torch")

    # Split the dataset into train and test parts.
    dataset_splits = dataset.train_test_split(test_size=0.2, shuffle=False, seed=42)

    return dataset_splits

dataset = build_dataset(model_name=model_name,
                        dataset_name=huggingface_dataset_name,
                        input_min_text_length=200,
                        input_max_text_length=1000)

print(dataset)
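
(A quick way to sanity-check the preprocessing, using the "query" and "input_ids" keys that tokenize() creates:)

sample = dataset["train"][0]
print(sample["query"])            # the instruction-wrapped dialogue, as text
print(sample["input_ids"][:10])   # first token ids (a torch tensor after set_format)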


I did not change the code, and after reopening the file the error is still there. Similar errors happen in the following lines, so I guess that if the first errors are solved, the errors in the following lines will disappear. What should I do? Thank you!

I tried again, with exactly the same code as above, and there is a new error.

error:
File /opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:369, in http_get(url, temp_file, proxies, resume_size, headers, expected_size, displayed_filename, _nb_retries, _tqdm_bar)
    366 if resume_size > 0:
    367     headers["Range"] = "bytes=%d-" % (resume_size,)
--> 369 r = _request_wrapper(
    370     method="GET", url=url, stream=True, proxies=proxies, headers=headers, timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT
    371 )
    372 hf_raise_for_status(r)
    373 content_length = r.headers.get("Content-Length")

File /opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:301, in _request_wrapper(method, url, follow_relative_redirects, **params)
    298     return response
    300 # Perform request and return if status_code is not in the retry list.
--> 301 response = get_session().request(method=method, url=url, **params)
    302 hf_raise_for_status(response)
    303 return response

File /opt/conda/lib/python3.11/site-packages/requests/sessions.py:589, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    584 send_kwargs = {
    585     "timeout": timeout,
    586     "allow_redirects": allow_redirects,
    587 }
    588 send_kwargs.update(settings)
--> 589 resp = self.send(prep, **send_kwargs)
    591 return resp

File /opt/conda/lib/python3.11/site-packages/requests/sessions.py:703, in Session.send(self, request, **kwargs)
    700 start = preferred_clock()
    702 # Send the request
--> 703 r = adapter.send(request, **kwargs)
    705 # Total elapsed time of the request (approximately)
    706 elapsed = preferred_clock() - start

File /opt/conda/lib/python3.11/site-packages/huggingface_hub/utils/_http.py:93, in UniqueRequestIdAdapter.send(self, request, *args, **kwargs)
     91 """Catch any RequestException to append request id to the error message for debugging."""
     92 try:
---> 93     return super().send(request, *args, **kwargs)
     94 except requests.RequestException as e:
     95     request_id = request.headers.get(X_AMZN_TRACE_ID)

File /opt/conda/lib/python3.11/site-packages/requests/adapters.py:713, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
    711     raise SSLError(e, request=request)
    712 elif isinstance(e, ReadTimeoutError):
--> 713     raise ReadTimeout(e, request=request)
    714 elif isinstance(e, _InvalidHeader):
    715     raise InvalidHeader(e, request=request)

ReadTimeout: (ReadTimeoutError("HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 0e786071-5bcd-4ffb-b933-3ae5df6c0565)')

Could you please check this thread?

@Sonja999,
If you’re going to post your code, please use the “preformatted text” tag. This will prevent your code from looking like very confusing Markdown.

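As for the ReadTimeout in your last trace: that is a network timeout while a file was being downloaded from the Hub (note timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT in the traceback, and “read timeout=10” in the exception). Such timeouts are usually transient, so re-running the failing cell often succeeds. If it keeps happening, one possible workaround, assuming your huggingface_hub version reads the HF_HUB_DOWNLOAD_TIMEOUT environment variable (its default of 10 seconds matches the trace), is to raise the timeout:

import os

# Assumption: huggingface_hub picks this up at import time, so set it before
# importing transformers/datasets, then re-run the failing cell.
os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "60"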