Unable to load dataset in Lab 2

When creating a post, please add:

  • Week # must be added in the tags option of the post.
  • Link to the classroom item you are referring to:
  • Description (include relevant info but please do not post solution code or your entire notebook):
I am in the middle of the lab, using the Coursera framework, and I am unable to load the dataset. I am getting the following error. Please help.

ValueError Traceback (most recent call last)
Cell In[16], line 3
1 huggingface_dataset_name = "knkarthick/dialogsum"
----> 3 dataset = load_dataset(huggingface_dataset_name)
5 dataset

File /opt/conda/lib/python3.12/site-packages/datasets/load.py:1704, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, **config_kwargs)
1701 ignore_verifications = ignore_verifications or save_infos
1703 # Create a dataset builder
----> 1704 builder_instance = load_dataset_builder(
1705 path=path,
1706 name=name,
1707 data_dir=data_dir,
1708 data_files=data_files,
1709 cache_dir=cache_dir,
1710 features=features,
1711 download_config=download_config,
1712 download_mode=download_mode,
1713 revision=revision,
1714 use_auth_token=use_auth_token,
1715 **config_kwargs,
1716 )
1718 # Return iterable dataset in case of streaming
1719 if streaming:

File /opt/conda/lib/python3.12/site-packages/datasets/load.py:1530, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, use_auth_token, **config_kwargs)
1528 download_config = download_config.copy() if download_config else DownloadConfig()
1529 download_config.use_auth_token = use_auth_token
----> 1530 dataset_module = dataset_module_factory(
1531 path,
1532 revision=revision,
1533 download_config=download_config,
1534 download_mode=download_mode,
1535 data_dir=data_dir,
1536 data_files=data_files,
1537 )
1539 # Get dataset builder class from the processing script
1540 builder_cls = import_main_class(dataset_module.module_path)

File /opt/conda/lib/python3.12/site-packages/datasets/load.py:1282, in dataset_module_factory(path, revision, download_config, download_mode, force_local_path, dynamic_modules_path, data_dir, data_files, **download_kwargs)
1277 if isinstance(e1, FileNotFoundError):
1278 raise FileNotFoundError(
1279 f"Couldn’t find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. "
1280 f"Couldn’t find ‘{path}’ on the Hugging Face Hub either: {type(e1).name}: {e1}"
1281 ) from None
→ 1282 raise e1 from None
1283 else:
1284 raise FileNotFoundError(
1285 f"Couldn’t find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory."
1286 )

File /opt/conda/lib/python3.12/site-packages/datasets/load.py:1270, in dataset_module_factory(path, revision, download_config, download_mode, force_local_path, dynamic_modules_path, data_dir, data_files, **download_kwargs)
1255 return HubDatasetModuleFactoryWithScript(
1256 path,
1257 revision=revision,
(...)
1260 dynamic_modules_path=dynamic_modules_path,
1261 ).get_module()
1262 else:
1263 return HubDatasetModuleFactoryWithoutScript(
1264 path,
1265 revision=revision,
1266 data_dir=data_dir,
1267 data_files=data_files,
1268 download_config=download_config,
1269 download_mode=download_mode,
----> 1270 ).get_module()
1271 except Exception as e1: # noqa: all the attempts failed, before raising the error we should check if the module is already cached.
1272 try:

File /opt/conda/lib/python3.12/site-packages/datasets/load.py:883, in HubDatasetModuleFactoryWithoutScript.get_module(self)
873 token = self.download_config.use_auth_token
874 hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
875 self.name,
876 revision=self.revision,
877 token=token,
878 timeout=100.0,
879 )
880 patterns = (
881 sanitize_patterns(self.data_files)
882 if self.data_files is not None
----> 883 else get_patterns_in_dataset_repository(hfh_dataset_info, self.data_dir)
884 )
885 data_files = DataFilesDict.from_hf_repo(
886 patterns,
887 dataset_info=hfh_dataset_info,
888 base_path=self.data_dir,
889 allowed_extensions=ALL_ALLOWED_EXTENSIONS,
890 )
891 module_names = {
892 key: infer_module_for_data_files(data_files_list, use_auth_token=self.download_config.use_auth_token)
893 for key, data_files_list in data_files.items()
894 }

File /opt/conda/lib/python3.12/site-packages/datasets/data_files.py:482, in get_patterns_in_dataset_repository(dataset_info, base_path)
480 resolver = partial(_resolve_single_pattern_in_dataset_repository, dataset_info, base_path=base_path)
481 try:
----> 482 return _get_data_files_patterns(resolver)
483 except FileNotFoundError:
484 raise FileNotFoundError(
485 f"The dataset repository at ‘{dataset_info.id}’ doesn’t contain any data file."
486 ) from None

File /opt/conda/lib/python3.12/site-packages/datasets/data_files.py:99, in _get_data_files_patterns(pattern_resolver)
97 try:
98 for pattern in patterns:
----> 99 data_files = pattern_resolver(pattern)
100 if len(data_files) > 0:
101 non_empty_splits.append(split)

File /opt/conda/lib/python3.12/site-packages/datasets/data_files.py:306, in _resolve_single_pattern_in_dataset_repository(dataset_info, pattern, base_path, allowed_extensions)
304 if base_path:
305 pattern = f"{base_path}/{pattern}"
----> 306 glob_iter = [PurePath(filepath) for filepath in fs.glob(PurePath(pattern).as_posix()) if fs.isfile(filepath)]
307 matched_paths = [
308 filepath
309 for filepath in glob_iter
310 if filepath.name not in data_files_ignore and not filepath.name.startswith(".")
311 ]
312 if allowed_extensions is not None:

File /opt/conda/lib/python3.12/site-packages/fsspec/spec.py:611, in AbstractFileSystem.glob(self, path, maxdepth, **kwargs)
607 depth = None
609 allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)
----> 611 pattern = glob_translate(path + ("/" if ends_with_sep else ""))
612 pattern = re.compile(pattern)
614 out = {
615 p: info
616 for p, info in sorted(allpaths.items())
(...)
623 )
624 }

File /opt/conda/lib/python3.12/site-packages/fsspec/utils.py:729, in glob_translate(pat)
727 continue
728 elif "**" in part:
----> 729 raise ValueError(
730 "Invalid pattern: '**' can only be an entire path component"
731 )
732 if part:
733 results.extend(_translate(part, f"{not_sep}*", not_sep))

ValueError: Invalid pattern: '**' can only be an entire path component


Hi. The pip installs might have failed. I’ve escalated this issue to our partners. In the meantime, please manually edit the pip install block and use 4.38.2 for the transformers package instead of 4.27.2. That worked for the labs I tested. Remember to restart the kernel before importing. Still waiting for updates. Thank you, and sorry for the inconvenience.
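For anyone unsure where to make that edit, here is a minimal sketch of the updated install cell, assuming the lab pins its packages in a single pip cell (your cell may list additional packages; those can stay as they are, only the transformers version pin needs to change):

%pip install transformers==4.38.2  # changed from: transformers==4.27.2

After the install finishes, restart the kernel as described above before re-running the import cells.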


Dear @SiddhiSingh1,

Have you tried the steps mentioned by Chris?

If you still face the problem, please send your notebook to me via personal message.

Hi everyone! I’m closing this thread so we can keep a single topic for this installation issue, which makes it easier for everyone to manage feedback and monitor updates as we work on it. Please refer to the topic here. Thank you!