Facing an issue with the Hugging Face load_dataset API when using the GitHub sample
Getting the following error and I'm not sure what is wrong. The same code works fine on my local MacBook.
My environment:
- Python 3.10 (pytorch_p310 conda env)
- datasets: 2.10.1
- OS: Amazon Linux 2 (SageMaker Notebook Instance)
- Link to the notebook on GitHub: https://github.com/huggingface/notebooks/blob/main/sagemaker/01_getting_started_pytorch/sagemaker-notebook.ipynb
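For reference, the failing cell boils down to this (a minimal repro sketch, assuming the same pytorch_p310 environment described above):

# Cell from the sample notebook that triggers the error below
from datasets import load_dataset
dataset = load_dataset("imdb")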
ValueError Traceback (most recent call last)
Cell In[10], line 2
1 # load dataset
----> 2 dataset = load_dataset('imdb')
4 # download tokenizer
5 tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
Stack trace:
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/load.py:1759, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, **config_kwargs)
1754 verification_mode = VerificationMode(
1755 (verification_mode or VerificationMode.BASIC_CHECKS) if not save_infos else VerificationMode.ALL_CHECKS
1756 )
1758 # Create a dataset builder
-> 1759 builder_instance = load_dataset_builder(
1760 path=path,
1761 name=name,
1762 data_dir=data_dir,
1763 data_files=data_files,
1764 cache_dir=cache_dir,
1765 features=features,
1766 download_config=download_config,
1767 download_mode=download_mode,
1768 revision=revision,
1769 use_auth_token=use_auth_token,
1770 **config_kwargs,
1771 )
1773 # Return iterable dataset in case of streaming
1774 if streaming:
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/load.py:1496, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, use_auth_token, **config_kwargs)
1494 download_config = download_config.copy() if download_config else DownloadConfig()
1495 download_config.use_auth_token = use_auth_token
-> 1496 dataset_module = dataset_module_factory(
1497 path,
1498 revision=revision,
1499 download_config=download_config,
1500 download_mode=download_mode,
1501 data_dir=data_dir,
1502 data_files=data_files,
1503 )
1505 # Get dataset builder class from the processing script
1506 builder_cls = import_main_class(dataset_module.module_path)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/load.py:1218, in dataset_module_factory(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, **download_kwargs)
1213 if isinstance(e1, FileNotFoundError):
1214 raise FileNotFoundError(
1215 f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. "
1216 f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
1217 ) from None
-> 1218 raise e1 from None
1219 else:
1220 raise FileNotFoundError(
1221 f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory."
1222 )
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/load.py:1202, in dataset_module_factory(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, **download_kwargs)
1187 return HubDatasetModuleFactoryWithScript(
1188 path,
1189 revision=revision,
(...)
1192 dynamic_modules_path=dynamic_modules_path,
1193 ).get_module()
1194 else:
1195 return HubDatasetModuleFactoryWithoutScript(
1196 path,
1197 revision=revision,
1198 data_dir=data_dir,
1199 data_files=data_files,
1200 download_config=download_config,
1201 download_mode=download_mode,
-> 1202 ).get_module()
1203 except (
1204 Exception
1205 ) as e1: # noqa: all the attempts failed, before raising the error we should check if the module is already cached.
1206 try:
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/load.py:767, in HubDatasetModuleFactoryWithoutScript.get_module(self)
756 def get_module(self) -> DatasetModule:
757 hfh_dataset_info = hf_api_dataset_info(
758 HfApi(config.HF_ENDPOINT),
759 self.name,
(...)
762 timeout=100.0,
763 )
764 patterns = (
765 sanitize_patterns(self.data_files)
766 if self.data_files is not None
--> 767 else get_data_patterns_in_dataset_repository(hfh_dataset_info, self.data_dir)
768 )
769 data_files = DataFilesDict.from_hf_repo(
770 patterns,
771 dataset_info=hfh_dataset_info,
772 base_path=self.data_dir,
773 allowed_extensions=ALL_ALLOWED_EXTENSIONS,
774 )
775 module_names = {
776 key: infer_module_for_data_files(data_files_list, use_auth_token=self.download_config.use_auth_token)
777 for key, data_files_list in data_files.items()
778 }
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/data_files.py:675, in get_data_patterns_in_dataset_repository(dataset_info, base_path)
673 resolver = partial(_resolve_single_pattern_in_dataset_repository, dataset_info, base_path=base_path)
674 try:
--> 675 return _get_data_files_patterns(resolver)
676 except FileNotFoundError:
677 raise EmptyDatasetError(
678 f"The dataset repository at '{dataset_info.id}' doesn't contain any data files"
679 ) from None
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/data_files.py:236, in _get_data_files_patterns(pattern_resolver)
234 try:
235 for pattern in patterns:
--> 236 data_files = pattern_resolver(pattern)
237 if len(data_files) > 0:
238 non_empty_splits.append(split)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/data_files.py:486, in _resolve_single_pattern_in_dataset_repository(dataset_info, pattern, base_path, allowed_extensions)
484 else:
485 base_path = "/"
--> 486 glob_iter = [PurePath(filepath) for filepath in fs.glob(PurePath(pattern).as_posix()) if fs.isfile(filepath)]
487 matched_paths = [
488 filepath
489 for filepath in glob_iter
(...)
496 )
497 ] # ignore .ipynb and __pycache__, but keep /../
498 if allowed_extensions is not None:
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/fsspec/spec.py:606, in AbstractFileSystem.glob(self, path, maxdepth, **kwargs)
602 depth = None
604 allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)
--> 606 pattern = glob_translate(path + ("/" if ends_with_sep else ""))
607 pattern = re.compile(pattern)
609 out = {
610 p: info
611 for p, info in sorted(allpaths.items())
(...)
618 )
619 }
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/fsspec/utils.py:734, in glob_translate(pat)
732 continue
733 elif "**" in part:
--> 734 raise ValueError(
735 "Invalid pattern: '**' can only be an entire path component"
736 )
737 if part:
738 results.extend(_translate(part, f"{not_sep}*", not_sep))
ValueError: Invalid pattern: '**' can only be an entire path component
1 Answer
Accepted Answer
Looks like I had to upgrade my datasets version to 2.18.0 to get past this error.
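From what I can tell, the error comes from combining an older datasets release with a newer fsspec, which changed how '**' glob patterns are handled, so upgrading datasets avoids the incompatibility. For anyone hitting the same thing, this is roughly what I ran in the notebook (the version pin comes from the fix above; restarting the kernel before re-importing is an assumption about how the notebook picks up the upgraded package):

# Notebook cell: upgrade datasets, then restart the kernel
%pip install --upgrade "datasets>=2.18.0"

# After restarting the kernel, verify the version and retry the load
import datasets
print(datasets.__version__)   # expect 2.18.0 or later

from datasets import load_dataset
dataset = load_dataset("imdb")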