While going through Lab 1 of Week 1, I get an error when trying to download the “dialogsum” dataset from Hugging Face.
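For reference, the cell I am running is the standard one from the notebook (I've added the import here for completeness; in the notebook it likely lives in an earlier cell):

from datasets import load_dataset

huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(huggingface_dataset_name)

Running that cell produces this traceback: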
---------------------------------------------------------------------------
HfHubHTTPError Traceback (most recent call last)
Cell In[5], line 3
1 huggingface_dataset_name = "knkarthick/dialogsum"
----> 3 dataset = load_dataset(huggingface_dataset_name)
File /opt/conda/lib/python3.10/site-packages/datasets/load.py:2548, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)
2543 verification_mode = VerificationMode(
2544 (verification_mode or VerificationMode.BASIC_CHECKS) if not save_infos else VerificationMode.ALL_CHECKS
2545 )
2547 # Create a dataset builder
-> 2548 builder_instance = load_dataset_builder(
2549 path=path,
2550 name=name,
2551 data_dir=data_dir,
2552 data_files=data_files,
2553 cache_dir=cache_dir,
2554 features=features,
2555 download_config=download_config,
2556 download_mode=download_mode,
2557 revision=revision,
2558 token=token,
2559 storage_options=storage_options,
2560 trust_remote_code=trust_remote_code,
2561 _require_default_config_name=name is None,
2562 **config_kwargs,
2563 )
2565 # Return iterable dataset in case of streaming
2566 if streaming:
File /opt/conda/lib/python3.10/site-packages/datasets/load.py:2220, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, token, use_auth_token, storage_options, trust_remote_code, _require_default_config_name, **config_kwargs)
2218 download_config = download_config.copy() if download_config else DownloadConfig()
2219 download_config.storage_options.update(storage_options)
-> 2220 dataset_module = dataset_module_factory(
2221 path,
2222 revision=revision,
2223 download_config=download_config,
2224 download_mode=download_mode,
2225 data_dir=data_dir,
2226 data_files=data_files,
2227 cache_dir=cache_dir,
2228 trust_remote_code=trust_remote_code,
2229 _require_default_config_name=_require_default_config_name,
2230 _require_custom_configs=bool(config_kwargs),
2231 )
2232 # Get dataset builder class from the processing script
2233 builder_kwargs = dataset_module.builder_kwargs
File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1871, in dataset_module_factory(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, cache_dir, trust_remote_code, _require_default_config_name, _require_custom_configs, **download_kwargs)
1866 if isinstance(e1, FileNotFoundError):
1867 raise FileNotFoundError(
1868 f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. "
1869 f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
1870 ) from None
-> 1871 raise e1 from None
1872 else:
1873 raise FileNotFoundError(
1874 f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory."
1875 )
File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1816, in dataset_module_factory(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, cache_dir, trust_remote_code, _require_default_config_name, _require_custom_configs, **download_kwargs)
1812 raise DatasetNotFoundError(
1813 msg + ". If the repo is private or gated, make sure to log in with `huggingface-cli login`."
1814 )
1815 else:
-> 1816 raise e
1817 if filename in [sibling.rfilename for sibling in dataset_info.siblings]: # contains a dataset script
1818 fs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=download_config.token)
File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1790, in dataset_module_factory(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, cache_dir, trust_remote_code, _require_default_config_name, _require_custom_configs, **download_kwargs)
1788 hf_api = HfApi(config.HF_ENDPOINT)
1789 try:
-> 1790 dataset_info = hf_api.dataset_info(
1791 repo_id=path,
1792 revision=revision,
1793 token=download_config.token,
1794 timeout=100.0,
1795 )
1796 except Exception as e: # noqa catch any exception of hf_hub and consider that the dataset doesn't exist
1797 if isinstance(
1798 e,
1799 (
(...)
1803 ),
1804 ):
File /opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:114, in validate_hf_hub_args.<locals>._inner_fn(*args, **kwargs)
111 if check_use_auth_token:
112 kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=has_token, kwargs=kwargs)
--> 114 return fn(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/huggingface_hub/hf_api.py:2446, in HfApi.dataset_info(self, repo_id, revision, timeout, files_metadata, expand, token)
2443 params["expand"] = expand
2445 r = get_session().get(path, headers=headers, timeout=timeout, params=params)
-> 2446 hf_raise_for_status(r)
2447 data = r.json()
2448 return DatasetInfo(**data)
File /opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_errors.py:371, in hf_raise_for_status(response, endpoint_name)
367 raise HfHubHTTPError(message, response=response) from e
369 # Convert `HTTPError` into a `HfHubHTTPError` to display request information
370 # as well (request id and/or server error message)
--> 371 raise HfHubHTTPError(str(e), response=response) from e
HfHubHTTPError: 500 Server Error: Internal Server Error for url: https://huggingface.co/api/datasets/knkarthick/dialogsum (Request ID: Root=1-66a50c87-6669ce4f3caa2b9e619edf18;2aaf8638-dd3f-4433-abc0-f81d29e1b397)
Internal Error - We're working hard to fix this as soon as possible!
This seems to be an internal error on Hugging Face's side that has nothing to do with the course. However, I am unable to proceed with this lab until it gets resolved.
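In case it is just a transient outage, I have been retrying with a small loop like the one below (only a sketch on my side, assuming the 500 eventually clears; it simply calls load_dataset again with a back-off):

import time
from datasets import load_dataset

huggingface_dataset_name = "knkarthick/dialogsum"

dataset = None
for attempt in range(5):
    try:
        dataset = load_dataset(huggingface_dataset_name)
        break  # loaded successfully
    except Exception as err:  # the 500 surfaces as HfHubHTTPError
        print(f"Attempt {attempt + 1} failed: {err}")
        time.sleep(2 ** attempt)  # wait a bit longer before each retry

if dataset is None:
    raise RuntimeError("Could not load dialogsum after several retries")

Of course, if the problem really is on the Hub side, retrying only helps once the outage itself is fixed.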
I understand that this is how things are sometimes, but after paying almost $70 for this course, I would expect the datasets and models to be self-hosted. We are working with text, after all; I would imagine the dataset for this particular lab can’t be that large either.
In the meantime, I would like to know what the policy is regarding access to the AWS environment. Can I access it anytime I want, or do I lose access after completing the course?
Thanks in advance!