Facing an issue with the Hugging Face load_dataset API when using the GitHub sample


I'm getting the following error and I'm not sure what is wrong. The same code works fine on my local MacBook. My environment:

Python: 3.10 (pytorch_p310 kernel)

datasets: 2.10.1

OS: Amazon Linux 2 (SageMaker notebook instance)

GitHub link to the notebook: https://github.com/huggingface/notebooks/blob/main/sagemaker/01_getting_started_pytorch/sagemaker-notebook.ipynb
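
For reference, a quick way to confirm the installed versions inside the notebook (a minimal sketch; the printed values reflect the environment listed above):

import datasets
import fsspec

# Versions in the failing environment (per the list above)
print(datasets.__version__)  # 2.10.1 here
print(fsspec.__version__)    # fsspec's glob handling is where the ValueError below is raised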

ValueError                                Traceback (most recent call last)
Cell In[10], line 2
      1 # load dataset
----> 2 dataset = load_dataset('imdb')
      4 # download tokenizer
      5 tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

Stack trace:

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/load.py:1759, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, **config_kwargs)
   1754 verification_mode = VerificationMode(
   1755     (verification_mode or VerificationMode.BASIC_CHECKS) if not save_infos else VerificationMode.ALL_CHECKS
   1756 )
   1758 # Create a dataset builder
-> 1759 builder_instance = load_dataset_builder(
   1760     path=path,
   1761     name=name,
   1762     data_dir=data_dir,
   1763     data_files=data_files,
   1764     cache_dir=cache_dir,
   1765     features=features,
   1766     download_config=download_config,
   1767     download_mode=download_mode,
   1768     revision=revision,
   1769     use_auth_token=use_auth_token,
   1770     **config_kwargs,
   1771 )
   1773 # Return iterable dataset in case of streaming
   1774 if streaming:

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/load.py:1496, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, use_auth_token, **config_kwargs)
   1494     download_config = download_config.copy() if download_config else DownloadConfig()
   1495     download_config.use_auth_token = use_auth_token
-> 1496 dataset_module = dataset_module_factory(
   1497     path,
   1498     revision=revision,
   1499     download_config=download_config,
   1500     download_mode=download_mode,
   1501     data_dir=data_dir,
   1502     data_files=data_files,
   1503 )
   1505 # Get dataset builder class from the processing script
   1506 builder_cls = import_main_class(dataset_module.module_path)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/load.py:1218, in dataset_module_factory(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, **download_kwargs)
   1213             if isinstance(e1, FileNotFoundError):
   1214                 raise FileNotFoundError(
   1215                     f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. "
   1216                     f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
   1217                 ) from None
-> 1218             raise e1 from None
   1219 else:
   1220     raise FileNotFoundError(
   1221         f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory."
   1222     )

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/load.py:1202, in dataset_module_factory(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, **download_kwargs)
   1187         return HubDatasetModuleFactoryWithScript(
   1188             path,
   1189             revision=revision,
   (...)
   1192             dynamic_modules_path=dynamic_modules_path,
   1193         ).get_module()
   1194     else:
   1195         return HubDatasetModuleFactoryWithoutScript(
   1196             path,
   1197             revision=revision,
   1198             data_dir=data_dir,
   1199             data_files=data_files,
   1200             download_config=download_config,
   1201             download_mode=download_mode,
-> 1202         ).get_module()
   1203 except (
   1204     Exception
   1205 ) as e1:  # noqa: all the attempts failed, before raising the error we should check if the module is already cached.
   1206     try:

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/load.py:767, in HubDatasetModuleFactoryWithoutScript.get_module(self)
    756 def get_module(self) -> DatasetModule:
    757     hfh_dataset_info = hf_api_dataset_info(
    758         HfApi(config.HF_ENDPOINT),
    759         self.name,
   (...)
    762         timeout=100.0,
    763     )
    764     patterns = (
    765         sanitize_patterns(self.data_files)
    766         if self.data_files is not None
--> 767         else get_data_patterns_in_dataset_repository(hfh_dataset_info, self.data_dir)
    768     )
    769     data_files = DataFilesDict.from_hf_repo(
    770         patterns,
    771         dataset_info=hfh_dataset_info,
    772         base_path=self.data_dir,
    773         allowed_extensions=ALL_ALLOWED_EXTENSIONS,
    774     )
    775     module_names = {
    776         key: infer_module_for_data_files(data_files_list, use_auth_token=self.download_config.use_auth_token)
    777         for key, data_files_list in data_files.items()
    778     }

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/data_files.py:675, in get_data_patterns_in_dataset_repository(dataset_info, base_path)
    673 resolver = partial(_resolve_single_pattern_in_dataset_repository, dataset_info, base_path=base_path)
    674 try:
--> 675     return _get_data_files_patterns(resolver)
    676 except FileNotFoundError:
    677     raise EmptyDatasetError(
    678         f"The dataset repository at '{dataset_info.id}' doesn't contain any data files"
    679     ) from None

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/data_files.py:236, in _get_data_files_patterns(pattern_resolver)
    234 try:
    235     for pattern in patterns:
--> 236         data_files = pattern_resolver(pattern)
    237         if len(data_files) > 0:
    238             non_empty_splits.append(split)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/data_files.py:486, in _resolve_single_pattern_in_dataset_repository(dataset_info, pattern, base_path, allowed_extensions)
    484 else:
    485     base_path = "/"
--> 486 glob_iter = [PurePath(filepath) for filepath in fs.glob(PurePath(pattern).as_posix()) if fs.isfile(filepath)]
    487 matched_paths = [
    488     filepath
    489     for filepath in glob_iter
   (...)
    496     )
    497 ]  # ignore .ipynb and __pycache__, but keep /../
    498 if allowed_extensions is not None:

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/fsspec/spec.py:606, in AbstractFileSystem.glob(self, path, maxdepth, **kwargs)
    602         depth = None
    604 allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)
--> 606 pattern = glob_translate(path + ("/" if ends_with_sep else ""))
    607 pattern = re.compile(pattern)
    609 out = {
    610     p: info
    611     for p, info in sorted(allpaths.items())
   (...)
    618     )
    619 }

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/fsspec/utils.py:734, in glob_translate(pat)
    732     continue
    733 elif "**" in part:
--> 734     raise ValueError(
    735         "Invalid pattern: '**' can only be an entire path component"
    736     )
    737 if part:
    738     results.extend(_translate(part, f"{not_sep}*", not_sep))

ValueError: Invalid pattern: '**' can only be an entire path component
1 Answer
Accepted Answer

Looks like I had to upgrade my datasets version to 2.18.0 to get past this error.
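
For context, this appears to be an incompatibility between older datasets releases and newer fsspec versions: as the bottom of the traceback shows, fsspec's glob_translate rejects glob patterns where '**' is not an entire path component, and datasets 2.10.1 can still emit such patterns when resolving data files. A minimal sketch of the fix in a notebook cell (the exact version pin follows the answer above; restarting the kernel afterwards is assumed):

# Upgrade datasets so its glob patterns are compatible with newer fsspec releases
%pip install --upgrade "datasets>=2.18.0"

# Restart the kernel so the upgraded package is imported, then re-run:
from datasets import load_dataset

dataset = load_dataset("imdb")  # should now resolve the data files without the ValueError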
