Hi, do you perhaps have a link? I can't find it.
1 Like
1 Like
I am experiencing this exact error. It has not been fixed by the course staff, and I don’t really know what to do here. It happens when calling trainer.train():
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
Cell In[71], line 1
----> 1 trainer.train()
File /opt/conda/lib/python3.12/site-packages/transformers/trainer.py:1624, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1622 hf_hub_utils.enable_progress_bars()
1623 else:
-> 1624 return inner_training_loop(
1625 args=args,
1626 resume_from_checkpoint=resume_from_checkpoint,
1627 trial=trial,
1628 ignore_keys_for_eval=ignore_keys_for_eval,
1629 )
File /opt/conda/lib/python3.12/site-packages/transformers/trainer.py:1879, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1876 model.zero_grad()
1877 grad_norm: Optional[float] = None
-> 1879 self.control = self.callback_handler.on_train_begin(args, self.state, self.control)
1881 # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point.
1882 if not args.ignore_data_skip:
File /opt/conda/lib/python3.12/site-packages/transformers/trainer_callback.py:370, in CallbackHandler.on_train_begin(self, args, state, control)
368 def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
369 control.should_training_stop = False
--> 370 return self.call_event("on_train_begin", args, state, control)
File /opt/conda/lib/python3.12/site-packages/transformers/trainer_callback.py:414, in CallbackHandler.call_event(self, event, args, state, control, **kwargs)
412 def call_event(self, event, args, state, control, **kwargs):
413 for callback in self.callbacks:
--> 414 result = getattr(callback, event)(
415 args,
416 state,
417 control,
418 model=self.model,
419 tokenizer=self.tokenizer,
420 optimizer=self.optimizer,
421 lr_scheduler=self.lr_scheduler,
422 train_dataloader=self.train_dataloader,
423 eval_dataloader=self.eval_dataloader,
424 **kwargs,
425 )
426 # A Callback can skip the return of `control` if it doesn't change it.
427 if result is not None:
File /opt/conda/lib/python3.12/site-packages/transformers/integrations/integration_utils.py:1035, in MLflowCallback.on_train_begin(self, args, state, control, model, **kwargs)
1033 def on_train_begin(self, args, state, control, model=None, **kwargs):
1034 if not self._initialized:
-> 1035 self.setup(args, state, model)
File /opt/conda/lib/python3.12/site-packages/transformers/integrations/integration_utils.py:1004, in MLflowCallback.setup(self, args, state, model)
1001 if self._experiment_name:
1002 # Use of set_experiment() ensure that Experiment is created if not exists
1003 self._ml_flow.set_experiment(self._experiment_name)
-> 1004 self._ml_flow.start_run(run_name=args.run_name, nested=self._nested_run)
1005 logger.debug(f"MLflow run started with run_id={self._ml_flow.active_run().info.run_id}")
1006 self._auto_end_run = True
File /opt/conda/lib/python3.12/site-packages/mlflow/tracking/fluent.py:446, in start_run(run_id, experiment_id, run_name, nested, parent_run_id, tags, description, log_system_metrics)
442 user_specified_tags[MLFLOW_RUN_NAME] = run_name
444 resolved_tags = context_registry.resolve_tags(user_specified_tags)
--> 446 active_run_obj = client.create_run(
447 experiment_id=exp_id_for_run,
448 tags=resolved_tags,
449 run_name=run_name,
450 )
452 if log_system_metrics is None:
453 # If `log_system_metrics` is not specified, we will check environment variable.
454 log_system_metrics = MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING.get()
File /opt/conda/lib/python3.12/site-packages/mlflow/tracking/client.py:418, in MlflowClient.create_run(self, experiment_id, start_time, tags, run_name)
364 def create_run(
365 self,
366 experiment_id: str,
(...)
369 run_name: Optional[str] = None,
370 ) -> Run:
371 """
372 Create a :py:class:`mlflow.entities.Run` object that can be associated with
373 metrics, parameters, artifacts, etc.
(...)
416 status: RUNNING
417 """
--> 418 return self._tracking_client.create_run(experiment_id, start_time, tags, run_name)
File /opt/conda/lib/python3.12/site-packages/mlflow/tracking/_tracking_service/client.py:170, in TrackingServiceClient.create_run(self, experiment_id, start_time, tags, run_name)
165 # Extract user from tags
166 # This logic is temporary; the user_id attribute of runs is deprecated and will be removed
167 # in a later release.
168 user_id = tags.get(MLFLOW_USER, "unknown")
--> 170 return self.store.create_run(
171 experiment_id=experiment_id,
172 user_id=user_id,
173 start_time=start_time or get_current_time_millis(),
174 tags=[RunTag(key, value) for (key, value) in tags.items()],
175 run_name=run_name,
176 )
File /opt/conda/lib/python3.12/site-packages/mlflow/store/tracking/file_store.py:644, in FileStore.create_run(self, experiment_id, user_id, start_time, tags, run_name)
640 """
641 Creates a run with the specified attributes.
642 """
643 experiment_id = FileStore.DEFAULT_EXPERIMENT_ID if experiment_id is None else experiment_id
--> 644 experiment = self.get_experiment(experiment_id)
645 if experiment is None:
646 raise MlflowException(
647 f"Could not create run under experiment with ID {experiment_id} - no such "
648 "experiment exists.",
649 databricks_pb2.RESOURCE_DOES_NOT_EXIST,
650 )
File /opt/conda/lib/python3.12/site-packages/mlflow/store/tracking/file_store.py:453, in FileStore.get_experiment(self, experiment_id)
442 """
443 Fetch the experiment.
444 Note: This API will search for active as well as deleted experiments.
(...)
450 A single Experiment object if it exists, otherwise raises an Exception.
451 """
452 experiment_id = FileStore.DEFAULT_EXPERIMENT_ID if experiment_id is None else experiment_id
--> 453 experiment = self._get_experiment(experiment_id)
454 if experiment is None:
455 raise MlflowException(
456 f"Experiment '{experiment_id}' does not exist.",
457 databricks_pb2.RESOURCE_DOES_NOT_EXIST,
458 )
File /opt/conda/lib/python3.12/site-packages/mlflow/store/tracking/file_store.py:419, in FileStore._get_experiment(self, experiment_id, view_type)
418 def _get_experiment(self, experiment_id, view_type=ViewType.ALL):
--> 419 self._check_root_dir()
420 _validate_experiment_id(experiment_id)
421 experiment_dir = self._get_experiment_path(experiment_id, view_type)
File /opt/conda/lib/python3.12/site-packages/mlflow/store/tracking/file_store.py:212, in FileStore._check_root_dir(self)
208 """
209 Run checks before running directory operations.
210 """
211 if not exists(self.root_directory):
--> 212 raise Exception(f"'{self.root_directory}' does not exist.")
213 if not is_directory(self.root_directory):
214 raise Exception(f"'{self.root_directory}' is not a directory.")
Exception: '/home/sagemaker-user/mlruns' does not exist.
1 Like
Hello,
I ran the Lab and it works fine. Try reopening the Lab from Coursera; that might reset it. You could also try resetting it by deleting the current Lab file in JupyterLab:
Right-click the file and delete it, then restore it from the terminal with the aws s3 cp --recursive s3://dlai-generative-ai/labs-202502/w2-170864 ./ command.
1 Like
