L6_Automatic_Speech_Recognition - Unable to load dataset from course's Jupyter notebook


Can you post a screenshot of the error you got? If the error log is too long, take two screenshots and post them.

There you go:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[3], line 1
----> 1 dataset = load_dataset("librispeech_asr",
      2                        split="train.clean.100",
      3                        streaming=True,
      4                        trust_remote_code=True)

File /usr/local/lib/python3.9/site-packages/datasets/load.py:2523, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)
   2518 verification_mode = VerificationMode(
   2519     (verification_mode or VerificationMode.BASIC_CHECKS) if not save_infos else VerificationMode.ALL_CHECKS
   2520 )
   2522 # Create a dataset builder
-> 2523 builder_instance = load_dataset_builder(
   2524     path=path,
   2525     name=name,
   2526     data_dir=data_dir,
   2527     data_files=data_files,
   2528     cache_dir=cache_dir,
   2529     features=features,
   2530     download_config=download_config,
   2531     download_mode=download_mode,
   2532     revision=revision,
   2533     token=token,
   2534     storage_options=storage_options,
   2535     trust_remote_code=trust_remote_code,
   2536     _require_default_config_name=name is None,
   2537     **config_kwargs,
   2538 )
   2540 # Return iterable dataset in case of streaming
   2541 if streaming:

File /usr/local/lib/python3.9/site-packages/datasets/load.py:2195, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, token, use_auth_token, storage_options, trust_remote_code, _require_default_config_name, **config_kwargs)
   2193     download_config = download_config.copy() if download_config else DownloadConfig()
   2194     download_config.storage_options.update(storage_options)
-> 2195 dataset_module = dataset_module_factory(
   2196     path,
   2197     revision=revision,
   2198     download_config=download_config,
   2199     download_mode=download_mode,
   2200     data_dir=data_dir,
   2201     data_files=data_files,
   2202     cache_dir=cache_dir,
   2203     trust_remote_code=trust_remote_code,
   2204     _require_default_config_name=_require_default_config_name,
   2205     _require_custom_configs=bool(config_kwargs),
   2206 )
   2207 # Get dataset builder class from the processing script
   2208 builder_kwargs = dataset_module.builder_kwargs

File /usr/local/lib/python3.9/site-packages/datasets/load.py:1846, in dataset_module_factory(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, cache_dir, trust_remote_code, _require_default_config_name, _require_custom_configs, **download_kwargs)
   1841             if isinstance(e1, FileNotFoundError):
   1842                 raise FileNotFoundError(
   1843                     f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. "
   1844                     f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
   1845                 ) from None
-> 1846             raise e1 from None
   1847 else:
   1848     raise FileNotFoundError(
   1849         f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory."
   1850     )

File /usr/local/lib/python3.9/site-packages/datasets/load.py:1791, in dataset_module_factory(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, cache_dir, trust_remote_code, _require_default_config_name, _require_custom_configs, **download_kwargs)
   1787         raise DatasetNotFoundError(
   1788             msg + ". If the repo is private or gated, make sure to log in with `huggingface-cli login`."
   1789         )
   1790     else:
-> 1791         raise e
   1792 if filename in [sibling.rfilename for sibling in dataset_info.siblings]:  # contains a dataset script
   1793     fs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=download_config.token)

File /usr/local/lib/python3.9/site-packages/datasets/load.py:1765, in dataset_module_factory(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, cache_dir, trust_remote_code, _require_default_config_name, _require_custom_configs, **download_kwargs)
   1763 hf_api = HfApi(config.HF_ENDPOINT)
   1764 try:
-> 1765     dataset_info = hf_api.dataset_info(
   1766         repo_id=path,
   1767         revision=revision,
   1768         token=download_config.token,
   1769         timeout=100.0,
   1770     )
   1771 except Exception as e:  # noqa catch any exception of hf_hub and consider that the dataset doesn't exist
   1772     if isinstance(
   1773         e,
   1774         (
   (...)
   1778         ),
   1779     ):

File /usr/local/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py:118, in validate_hf_hub_args.<locals>._inner_fn(*args, **kwargs)
    115 if check_use_auth_token:
    116     kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=has_token, kwargs=kwargs)
--> 118 return fn(*args, **kwargs)

File /usr/local/lib/python3.9/site-packages/huggingface_hub/hf_api.py:2150, in HfApi.dataset_info(self, repo_id, revision, timeout, files_metadata, token)
   2148 hf_raise_for_status(r)
   2149 data = r.json()
-> 2150 return DatasetInfo(**data)

File /usr/local/lib/python3.9/site-packages/huggingface_hub/hf_api.py:770, in DatasetInfo.__init__(self, **kwargs)
    768 self.likes = kwargs.pop("likes")
    769 self.paperswithcode_id = kwargs.pop("paperswithcode_id", None)
--> 770 self.tags = kwargs.pop("tags")
    771 card_data = kwargs.pop("cardData", None) or kwargs.pop("card_data", None)
    772 self.card_data = (
    773     DatasetCardData(**card_data, ignore_metadata_errors=True) if isinstance(card_data, dict) else card_data
    774 )

KeyError: 'tags'

The traceback indicates that the code is failing because the tags key is missing from the dataset information returned by the Hugging Face Hub. The root cause is most likely an outdated version of the datasets or huggingface_hub library, which no longer matches the shape of the Hub's API response.
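The KeyError itself comes from the last frame of the traceback: DatasetInfo.__init__ calls kwargs.pop("tags") without a default, so any API response that lacks a tags field raises immediately. A minimal sketch of the same failure mode (the dict below simulates a Hub response; it is not real API output):

# dict.pop without a default raises KeyError when the key is absent
data = {"likes": 3}        # simulated Hub response missing "tags"
data.pop("likes")          # fine: key is present, returns 3
data.pop("tags")           # KeyError: 'tags' -- same error as in the traceback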

To fix the problem, upgrade both libraries to their latest versions. In short, add the following line at the beginning of the notebook and restart the kernel:

%pip install --upgrade datasets huggingface_hub
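After the upgrade and kernel restart, you can re-run the original cell. Here is a quick sketch to confirm the fix took effect (the printed versions will vary; any recent release of both libraries should do):

import datasets
import huggingface_hub

# Verify the upgraded versions are the ones actually loaded
print(datasets.__version__, huggingface_hub.__version__)

# Re-run the exact call from the traceback
from datasets import load_dataset
dataset = load_dataset("librispeech_asr",
                       split="train.clean.100",
                       streaming=True,
                       trust_remote_code=True)
print(next(iter(dataset)))  # should yield a sample instead of KeyError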

For clarity, I have attached the screenshots as well.

Hope this helps.


Thanks for the help, I was able to load the dataset and run the notebook’s code. Someone should update the course notebook’s underlying setup so that other learners don’t run into this issue.

@SGM

If the issue arises in the course environment, let me know. I will report it to the staff for an update.

Thanks for reporting!


Yes, I’ve experienced this issue in the course environment; kindly refer to the screen capture in my original post.

@lesly.zerna please update this notebook for the dataset version issue, as it causes an error in the course environment.


The fix has been implemented! You can try the notebook again!