I am unable to run C1M2_Ungraded_lab2 in Module 2 of the course “Retrieval Augmented Generation (RAG)”.
This is the error I get (after trying a few times):
Running: Module 2, Ungraded Lab 2: EXECUTABLE Cell: [2] under 1.1 DataSets
Lines:
from sklearn.datasets import fetch_20newsgroups
# Load the 20 Newsgroups dataset
newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
# Convert the dataset to a DataFrame for easier handling
df = pd.DataFrame({
    'text': newsgroups_train.data,
    'category': newsgroups_train.target
})
Error:
--------------------------------------------------------------------------
HTTPError Traceback (most recent call last)
Cell In[2], line 4
1 from sklearn.datasets import fetch_20newsgroups
3 # Load the 20 Newsgroups dataset
----> 4 newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
6 # Convert the dataset to a DataFrame for easier handling
7 df = pd.DataFrame({
8 'text': newsgroups_train.data,
9 'category': newsgroups_train.target
10 })
File /usr/local/lib/python3.11/site-packages/sklearn/utils/_param_validation.py:218, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
212 try:
213 with config_context(
214 skip_parameter_validation=(
215 prefer_skip_nested_validation or global_skip_validation
216 )
217 ):
--> 218 return func(*args, **kwargs)
219 except InvalidParameterError as e:
220 # When the function is just a wrapper around an estimator, we allow
221 # the function to delegate validation to the estimator, but we replace
222 # the name of the estimator by the name of the function in the error
223 # message to avoid confusion.
224 msg = re.sub(
225 r"parameter of \w+ must be",
226 f"parameter of {func.__qualname__} must be",
227 str(e),
228 )
File /usr/local/lib/python3.11/site-packages/sklearn/datasets/_twenty_newsgroups.py:320, in fetch_20newsgroups(data_home, subset, categories, shuffle, random_state, remove, download_if_missing, return_X_y, n_retries, delay)
318 if download_if_missing:
319 logger.info("Downloading 20news dataset. This may take a few minutes.")
--> 320 cache = _download_20newsgroups(
321 target_dir=twenty_home,
322 cache_path=cache_path,
323 n_retries=n_retries,
324 delay=delay,
325 )
326 else:
327 raise OSError("20Newsgroups dataset not found")
File /usr/local/lib/python3.11/site-packages/sklearn/datasets/_twenty_newsgroups.py:79, in _download_20newsgroups(target_dir, cache_path, n_retries, delay)
76 os.makedirs(target_dir, exist_ok=True)
78 logger.info("Downloading dataset from %s (14 MB)", ARCHIVE.url)
---> 79 archive_path = _fetch_remote(
80 ARCHIVE, dirname=target_dir, n_retries=n_retries, delay=delay
81 )
83 logger.debug("Decompressing %s", archive_path)
84 with tarfile.open(archive_path, "r:gz") as fp:
File /usr/local/lib/python3.11/site-packages/sklearn/datasets/_base.py:1512, in _fetch_remote(remote, dirname, n_retries, delay)
1510 while True:
1511 try:
-> 1512 urlretrieve(remote.url, temp_file_path)
1513 break
1514 except (URLError, TimeoutError):
File /usr/local/lib/python3.11/urllib/request.py:241, in urlretrieve(url, filename, reporthook, data)
224 """
225 Retrieve a URL into a temporary location on disk.
226
(...)
237 data file as well as the resulting HTTPMessage object.
238 """
239 url_type, path = _splittype(url)
--> 241 with contextlib.closing(urlopen(url, data)) as fp:
242 headers = fp.info()
244 # Just return the local path and the "headers" for file://
245 # URLs. No sense in performing a copy unless requested.
File /usr/local/lib/python3.11/urllib/request.py:216, in urlopen(url, data, timeout, cafile, capath, cadefault, context)
214 else:
215 opener = _opener
--> 216 return opener.open(url, data, timeout)
File /usr/local/lib/python3.11/urllib/request.py:525, in OpenerDirector.open(self, fullurl, data, timeout)
523 for processor in self.process_response.get(protocol, []):
524 meth = getattr(processor, meth_name)
--> 525 response = meth(req, response)
527 return response
File /usr/local/lib/python3.11/urllib/request.py:634, in HTTPErrorProcessor.http_response(self, request, response)
631 # According to RFC 2616, "2xx" code indicates that the client's
632 # request was successfully received, understood, and accepted.
633 if not (200 <= code < 300):
--> 634 response = self.parent.error(
635 'http', request, response, code, msg, hdrs)
637 return response
File /usr/local/lib/python3.11/urllib/request.py:563, in OpenerDirector.error(self, proto, *args)
561 if http_err:
562 args = (dict, 'default', 'http_error_default') + orig_args
--> 563 return self._call_chain(*args)
File /usr/local/lib/python3.11/urllib/request.py:496, in OpenerDirector._call_chain(self, chain, kind, meth_name, *args)
494 for handler in handlers:
495 func = getattr(handler, meth_name)
--> 496 result = func(*args)
497 if result is not None:
498 return result
File /usr/local/lib/python3.11/urllib/request.py:643, in HTTPDefaultErrorHandler.http_error_default(self, req, fp, code, msg, hdrs)
642 def http_error_default(self, req, fp, code, msg, hdrs):
--> 643 raise HTTPError(req.full_url, code, msg, hdrs, fp)
HTTPError: HTTP Error 403: Forbidden

