Lab 1 is not working

Lab 1 is not working.

huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(huggingface_dataset_name)

I am getting this error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[13], line 2
      1 huggingface_dataset_name = "knkarthick/dialogsum"
----> 2 dataset = load_dataset(huggingface_dataset_name)

File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1767, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, storage_options, **config_kwargs)
   1762 verification_mode = VerificationMode(
   1763     (verification_mode or VerificationMode.BASIC_CHECKS) if not save_infos else VerificationMode.ALL_CHECKS
   1764 )
   1766 # Create a dataset builder
-> 1767 builder_instance = load_dataset_builder(
   1768     path=path,
   1769     name=name,
   1770     data_dir=data_dir,
   1771     data_files=data_files,
   1772     cache_dir=cache_dir,
   1773     features=features,
   1774     download_config=download_config,
   1775     download_mode=download_mode,
   1776     revision=revision,
   1777     use_auth_token=use_auth_token,
   1778     storage_options=storage_options,
   1779     **config_kwargs,
   1780 )
   1782 # Return iterable dataset in case of streaming
   1783 if streaming:

File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1498, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, use_auth_token, storage_options, **config_kwargs)
   1496     download_config = download_config.copy() if download_config else DownloadConfig()
   1497     download_config.use_auth_token = use_auth_token
-> 1498 dataset_module = dataset_module_factory(
   1499     path,
   1500     revision=revision,
   1501     download_config=download_config,
   1502     download_mode=download_mode,
   1503     data_dir=data_dir,
   1504     data_files=data_files,
   1505 )
   1507 # Get dataset builder class from the processing script
   1508 builder_cls = import_main_class(dataset_module.module_path)

File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1215, in dataset_module_factory(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, **download_kwargs)
   1210             if isinstance(e1, FileNotFoundError):
   1211                 raise FileNotFoundError(
   1212                     f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. "
   1213                     f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
   1214                 ) from None
-> 1215             raise e1 from None
   1216 else:
   1217     raise FileNotFoundError(
   1218         f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory."
   1219     )

File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1199, in dataset_module_factory(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, **download_kwargs)
   1184         return HubDatasetModuleFactoryWithScript(
   1185             path,
   1186             revision=revision,
   (...)
   1189             dynamic_modules_path=dynamic_modules_path,
   1190         ).get_module()
   1191     else:
   1192         return HubDatasetModuleFactoryWithoutScript(
   1193             path,
   1194             revision=revision,
   1195             data_dir=data_dir,
   1196             data_files=data_files,
   1197             download_config=download_config,
   1198             download_mode=download_mode,
-> 1199         ).get_module()
   1200 except (
   1201     Exception
   1202 ) as e1:  # noqa: all the attempts failed, before raising the error we should check if the module is already cached.
   1203     try:

File /opt/conda/lib/python3.10/site-packages/datasets/load.py:765, in HubDatasetModuleFactoryWithoutScript.get_module(self)
    755 def get_module(self) -> DatasetModule:
    756     hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
    757         self.name,
    758         revision=self.revision,
    759         token=self.download_config.use_auth_token,
    760         timeout=100.0,
    761     )
    762     patterns = (
    763         sanitize_patterns(self.data_files)
    764         if self.data_files is not None
--> 765         else get_data_patterns_in_dataset_repository(hfh_dataset_info, self.data_dir)
    766     )
    767     data_files = DataFilesDict.from_hf_repo(
    768         patterns,
    769         dataset_info=hfh_dataset_info,
    770         base_path=self.data_dir,
    771         allowed_extensions=ALL_ALLOWED_EXTENSIONS,
    772     )
    773     module_names = {
    774         key: infer_module_for_data_files(data_files_list, use_auth_token=self.download_config.use_auth_token)
    775         for key, data_files_list in data_files.items()
    776     }

File /opt/conda/lib/python3.10/site-packages/datasets/data_files.py:675, in get_data_patterns_in_dataset_repository(dataset_info, base_path)
    673 resolver = partial(_resolve_single_pattern_in_dataset_repository, dataset_info, base_path=base_path)
    674 try:
--> 675     return _get_data_files_patterns(resolver)
    676 except FileNotFoundError:
    677     raise EmptyDatasetError(
    678         f"The dataset repository at '{dataset_info.id}' doesn't contain any data files"
    679     ) from None

File /opt/conda/lib/python3.10/site-packages/datasets/data_files.py:236, in _get_data_files_patterns(pattern_resolver)
    234 try:
    235     for pattern in patterns:
--> 236         data_files = pattern_resolver(pattern)
    237         if len(data_files) > 0:
    238             non_empty_splits.append(split)

File /opt/conda/lib/python3.10/site-packages/datasets/data_files.py:486, in _resolve_single_pattern_in_dataset_repository(dataset_info, pattern, base_path, allowed_extensions)
    484 else:
    485     base_path = "/"
--> 486 glob_iter = [PurePath(filepath) for filepath in fs.glob(PurePath(pattern).as_posix()) if fs.isfile(filepath)]
    487 matched_paths = [
    488     filepath
    489     for filepath in glob_iter
   (...)
    496     )
    497 ]  # ignore .ipynb and __pycache__, but keep /../
    498 if allowed_extensions is not None:

File /opt/conda/lib/python3.10/site-packages/fsspec/spec.py:606, in AbstractFileSystem.glob(self, path, maxdepth, **kwargs)
    602         depth = None
    604 allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)
--> 606 pattern = glob_translate(path + ("/" if ends_with_sep else ""))
    607 pattern = re.compile(pattern)
    609 out = {
    610     p: info
    611     for p, info in sorted(allpaths.items())
   (...)
    618     )
    619 }

File /opt/conda/lib/python3.10/site-packages/fsspec/utils.py:734, in glob_translate(pat)
    732     continue
    733 elif "**" in part:
--> 734     raise ValueError(
    735         "Invalid pattern: '**' can only be an entire path component"
    736     )
    737 if part:
    738     results.extend(_translate(part, f"{not_sep}*", not_sep))

ValueError: Invalid pattern: '**' can only be an entire path component
1 Like

I'm facing the same error…

Solution

Step 1 - Restart the kernel.
Step 2 - Update the `datasets` library version:
%pip install transformers==4.27.2 datasets==2.17.0 --quiet
Then also run:
%pip install -U datasets

This solved the problem for me.

1 Like