Lab 2 Loading Dataset Fails; Cell 5 Dataset Errors

Kevin_Wharram · February 12, 2024, 8:49pm

Hi, do you perhaps have a link? I can't find it.

nik95 · February 12, 2024, 9:00pm

jermaineo · July 1, 2025, 8:01pm

I am experiencing this exact error. It has not been fixed by the course staff, and I don’t really know what to do here. It happens when triggering trainer.train():

---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
Cell In[71], line 1
----> 1 trainer.train()

File /opt/conda/lib/python3.12/site-packages/transformers/trainer.py:1624, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1622         hf_hub_utils.enable_progress_bars()
   1623 else:
-> 1624     return inner_training_loop(
   1625         args=args,
   1626         resume_from_checkpoint=resume_from_checkpoint,
   1627         trial=trial,
   1628         ignore_keys_for_eval=ignore_keys_for_eval,
   1629     )

File /opt/conda/lib/python3.12/site-packages/transformers/trainer.py:1879, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   1876 model.zero_grad()
   1877 grad_norm: Optional[float] = None
-> 1879 self.control = self.callback_handler.on_train_begin(args, self.state, self.control)
   1881 # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point.
   1882 if not args.ignore_data_skip:

File /opt/conda/lib/python3.12/site-packages/transformers/trainer_callback.py:370, in CallbackHandler.on_train_begin(self, args, state, control)
    368 def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
    369     control.should_training_stop = False
--> 370     return self.call_event("on_train_begin", args, state, control)

File /opt/conda/lib/python3.12/site-packages/transformers/trainer_callback.py:414, in CallbackHandler.call_event(self, event, args, state, control, **kwargs)
    412 def call_event(self, event, args, state, control, **kwargs):
    413     for callback in self.callbacks:
--> 414         result = getattr(callback, event)(
    415             args,
    416             state,
    417             control,
    418             model=self.model,
    419             tokenizer=self.tokenizer,
    420             optimizer=self.optimizer,
    421             lr_scheduler=self.lr_scheduler,
    422             train_dataloader=self.train_dataloader,
    423             eval_dataloader=self.eval_dataloader,
    424             **kwargs,
    425         )
    426         # A Callback can skip the return of `control` if it doesn't change it.
    427         if result is not None:

File /opt/conda/lib/python3.12/site-packages/transformers/integrations/integration_utils.py:1035, in MLflowCallback.on_train_begin(self, args, state, control, model, **kwargs)
   1033 def on_train_begin(self, args, state, control, model=None, **kwargs):
   1034     if not self._initialized:
-> 1035         self.setup(args, state, model)

File /opt/conda/lib/python3.12/site-packages/transformers/integrations/integration_utils.py:1004, in MLflowCallback.setup(self, args, state, model)
   1001 if self._experiment_name:
   1002     # Use of set_experiment() ensure that Experiment is created if not exists
   1003     self._ml_flow.set_experiment(self._experiment_name)
-> 1004 self._ml_flow.start_run(run_name=args.run_name, nested=self._nested_run)
   1005 logger.debug(f"MLflow run started with run_id={self._ml_flow.active_run().info.run_id}")
   1006 self._auto_end_run = True

File /opt/conda/lib/python3.12/site-packages/mlflow/tracking/fluent.py:446, in start_run(run_id, experiment_id, run_name, nested, parent_run_id, tags, description, log_system_metrics)
    442         user_specified_tags[MLFLOW_RUN_NAME] = run_name
    444     resolved_tags = context_registry.resolve_tags(user_specified_tags)
--> 446     active_run_obj = client.create_run(
    447         experiment_id=exp_id_for_run,
    448         tags=resolved_tags,
    449         run_name=run_name,
    450     )
    452 if log_system_metrics is None:
    453     # If `log_system_metrics` is not specified, we will check environment variable.
    454     log_system_metrics = MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING.get()

File /opt/conda/lib/python3.12/site-packages/mlflow/tracking/client.py:418, in MlflowClient.create_run(self, experiment_id, start_time, tags, run_name)
    364 def create_run(
    365     self,
    366     experiment_id: str,
   (...)
    369     run_name: Optional[str] = None,
    370 ) -> Run:
    371     """
    372     Create a :py:class:`mlflow.entities.Run` object that can be associated with
    373     metrics, parameters, artifacts, etc.
   (...)
    416         status: RUNNING
    417     """
--> 418     return self._tracking_client.create_run(experiment_id, start_time, tags, run_name)

File /opt/conda/lib/python3.12/site-packages/mlflow/tracking/_tracking_service/client.py:170, in TrackingServiceClient.create_run(self, experiment_id, start_time, tags, run_name)
    165 # Extract user from tags
    166 # This logic is temporary; the user_id attribute of runs is deprecated and will be removed
    167 # in a later release.
    168 user_id = tags.get(MLFLOW_USER, "unknown")
--> 170 return self.store.create_run(
    171     experiment_id=experiment_id,
    172     user_id=user_id,
    173     start_time=start_time or get_current_time_millis(),
    174     tags=[RunTag(key, value) for (key, value) in tags.items()],
    175     run_name=run_name,
    176 )

File /opt/conda/lib/python3.12/site-packages/mlflow/store/tracking/file_store.py:644, in FileStore.create_run(self, experiment_id, user_id, start_time, tags, run_name)
    640 """
    641 Creates a run with the specified attributes.
    642 """
    643 experiment_id = FileStore.DEFAULT_EXPERIMENT_ID if experiment_id is None else experiment_id
--> 644 experiment = self.get_experiment(experiment_id)
    645 if experiment is None:
    646     raise MlflowException(
    647         f"Could not create run under experiment with ID {experiment_id} - no such "
    648         "experiment exists.",
    649         databricks_pb2.RESOURCE_DOES_NOT_EXIST,
    650     )

File /opt/conda/lib/python3.12/site-packages/mlflow/store/tracking/file_store.py:453, in FileStore.get_experiment(self, experiment_id)
    442 """
    443 Fetch the experiment.
    444 Note: This API will search for active as well as deleted experiments.
   (...)
    450     A single Experiment object if it exists, otherwise raises an Exception.
    451 """
    452 experiment_id = FileStore.DEFAULT_EXPERIMENT_ID if experiment_id is None else experiment_id
--> 453 experiment = self._get_experiment(experiment_id)
    454 if experiment is None:
    455     raise MlflowException(
    456         f"Experiment '{experiment_id}' does not exist.",
    457         databricks_pb2.RESOURCE_DOES_NOT_EXIST,
    458     )

File /opt/conda/lib/python3.12/site-packages/mlflow/store/tracking/file_store.py:419, in FileStore._get_experiment(self, experiment_id, view_type)
    418 def _get_experiment(self, experiment_id, view_type=ViewType.ALL):
--> 419     self._check_root_dir()
    420     _validate_experiment_id(experiment_id)
    421     experiment_dir = self._get_experiment_path(experiment_id, view_type)

File /opt/conda/lib/python3.12/site-packages/mlflow/store/tracking/file_store.py:212, in FileStore._check_root_dir(self)
    208 """
    209 Run checks before running directory operations.
    210 """
    211 if not exists(self.root_directory):
--> 212     raise Exception(f"'{self.root_directory}' does not exist.")
    213 if not is_directory(self.root_directory):
    214     raise Exception(f"'{self.root_directory}' is not a directory.")

Exception: '/home/sagemaker-user/mlruns' does not exist.

gent.spah · July 2, 2025, 7:13am

Hello,

I ran the Lab and it works fine. Try reopening the Lab from Coursera; that might reset it. You could also try resetting it by deleting the current Lab file from JupyterLab:

Right-click the file and delete it. Then restore it from the terminal with the command `aws s3 cp --recursive s3://dlai-generative-ai/labs-202502/w2-170864 ./`.

Topic		Replies	Views
Cannot complete Lab1 due to errors during execution Generative AI with Large Language Models lab-help	17	501	February 13, 2024
Cannot load data set from huggingface in section 2 Generative AI with Large Language Models week-module-3	4	378	March 28, 2024
Lab1 is not working Generative AI with Large Language Models week-module-1	2	829	February 11, 2024
Lab 1 Step 2 has some errors in Generative AI with Large Language Models week-module-1	7	316	February 11, 2024
Lab 1 issue with dataset error Generative AI with Large Language Models week-module-1	8	357	March 19, 2024

Lab 2 Loading Dataset Fails; Cell 5 Dataset Errors

Related topics