Facing an issue with the Hugging Face load_dataset API when using the GitHub sample notebook


I'm getting the following error and I'm not sure what is wrong. The same code works fine on my local MacBook. My environment:

Env: Python 3.10 (pytorch_p310 conda environment)

datasets: 2.10.1

Linux: Amazon Linux 2 (SageMaker notebook instance)

GitHub link to the notebook: https://github.com/huggingface/notebooks/blob/main/sagemaker/01_getting_started_pytorch/sagemaker-notebook.ipynb
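
For completeness, here are the versions as reported from inside the notebook kernel (a quick check; fsspec is included because it shows up in the traceback below):

import datasets, fsspec
print("datasets:", datasets.__version__)   # 2.10.1 in this environment
print("fsspec:", fsspec.__version__)       # whatever the SageMaker image ships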

ValueError                                Traceback (most recent call last)
Cell In[10], line 2
      1 # load dataset
----> 2 dataset = load_dataset('imdb')
      4 # download tokenizer
      5 tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

Stack trace:

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/load.py:1759, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, **config_kwargs)
   1754 verification_mode = VerificationMode(
   1755     (verification_mode or VerificationMode.BASIC_CHECKS) if not save_infos else VerificationMode.ALL_CHECKS
   1756 )
   1758 # Create a dataset builder
-> 1759 builder_instance = load_dataset_builder(
   1760     path=path,
   1761     name=name,
   1762     data_dir=data_dir,
   1763     data_files=data_files,
   1764     cache_dir=cache_dir,
   1765     features=features,
   1766     download_config=download_config,
   1767     download_mode=download_mode,
   1768     revision=revision,
   1769     use_auth_token=use_auth_token,
   1770     **config_kwargs,
   1771 )
   1773 # Return iterable dataset in case of streaming
   1774 if streaming:

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/load.py:1496, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, use_auth_token, **config_kwargs)
   1494     download_config = download_config.copy() if download_config else DownloadConfig()
   1495     download_config.use_auth_token = use_auth_token
-> 1496 dataset_module = dataset_module_factory(
   1497     path,
   1498     revision=revision,
   1499     download_config=download_config,
   1500     download_mode=download_mode,
   1501     data_dir=data_dir,
   1502     data_files=data_files,
   1503 )
   1505 # Get dataset builder class from the processing script
   1506 builder_cls = import_main_class(dataset_module.module_path)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/load.py:1218, in dataset_module_factory(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, **download_kwargs)
   1213             if isinstance(e1, FileNotFoundError):
   1214                 raise FileNotFoundError(
   1215                     f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. "
   1216                     f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
   1217                 ) from None
-> 1218             raise e1 from None
   1219 else:
   1220     raise FileNotFoundError(
   1221         f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory."
   1222     )

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/load.py:1202, in dataset_module_factory(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, **download_kwargs)
   1187         return HubDatasetModuleFactoryWithScript(
   1188             path,
   1189             revision=revision,
   (...)
   1192             dynamic_modules_path=dynamic_modules_path,
   1193         ).get_module()
   1194     else:
   1195         return HubDatasetModuleFactoryWithoutScript(
   1196             path,
   1197             revision=revision,
   1198             data_dir=data_dir,
   1199             data_files=data_files,
   1200             download_config=download_config,
   1201             download_mode=download_mode,
-> 1202         ).get_module()
   1203 except (
   1204     Exception
   1205 ) as e1:  # noqa: all the attempts failed, before raising the error we should check if the module is already cached.
   1206     try:

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/load.py:767, in HubDatasetModuleFactoryWithoutScript.get_module(self)
    756 def get_module(self) -> DatasetModule:
    757     hfh_dataset_info = hf_api_dataset_info(
    758         HfApi(config.HF_ENDPOINT),
    759         self.name,
   (...)
    762         timeout=100.0,
    763     )
    764     patterns = (
    765         sanitize_patterns(self.data_files)
    766         if self.data_files is not None
--> 767         else get_data_patterns_in_dataset_repository(hfh_dataset_info, self.data_dir)
    768     )
    769     data_files = DataFilesDict.from_hf_repo(
    770         patterns,
    771         dataset_info=hfh_dataset_info,
    772         base_path=self.data_dir,
    773         allowed_extensions=ALL_ALLOWED_EXTENSIONS,
    774     )
    775     module_names = {
    776         key: infer_module_for_data_files(data_files_list, use_auth_token=self.download_config.use_auth_token)
    777         for key, data_files_list in data_files.items()
    778     }

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/data_files.py:675, in get_data_patterns_in_dataset_repository(dataset_info, base_path)
    673 resolver = partial(_resolve_single_pattern_in_dataset_repository, dataset_info, base_path=base_path)
    674 try:
--> 675     return _get_data_files_patterns(resolver)
    676 except FileNotFoundError:
    677     raise EmptyDatasetError(
    678         f"The dataset repository at '{dataset_info.id}' doesn't contain any data files"
    679     ) from None

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/data_files.py:236, in _get_data_files_patterns(pattern_resolver)
    234 try:
    235     for pattern in patterns:
--> 236         data_files = pattern_resolver(pattern)
    237         if len(data_files) > 0:
    238             non_empty_splits.append(split)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/datasets/data_files.py:486, in _resolve_single_pattern_in_dataset_repository(dataset_info, pattern, base_path, allowed_extensions)
    484 else:
    485     base_path = "/"
--> 486 glob_iter = [PurePath(filepath) for filepath in fs.glob(PurePath(pattern).as_posix()) if fs.isfile(filepath)]
    487 matched_paths = [
    488     filepath
    489     for filepath in glob_iter
   (...)
    496     )
    497 ]  # ignore .ipynb and __pycache__, but keep /../
    498 if allowed_extensions is not None:

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/fsspec/spec.py:606, in AbstractFileSystem.glob(self, path, maxdepth, **kwargs)
    602         depth = None
    604 allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)
--> 606 pattern = glob_translate(path + ("/" if ends_with_sep else ""))
    607 pattern = re.compile(pattern)
    609 out = {
    610     p: info
    611     for p, info in sorted(allpaths.items())
   (...)
    618     )
    619 }

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/fsspec/utils.py:734, in glob_translate(pat)
    732     continue
    733 elif "**" in part:
--> 734     raise ValueError(
    735         "Invalid pattern: '**' can only be an entire path component"
    736     )
    737 if part:
    738     results.extend(_translate(part, f"{not_sep}*", not_sep))

ValueError: Invalid pattern: '**' can only be an entire path component
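
Digging into the bottom frame, the ValueError is raised by fsspec's glob_translate, which rejects any glob pattern in which '**' is attached to other characters instead of standing alone as a path component. The following minimal snippet reproduces just that part against the installed fsspec (the second pattern is only illustrative of the shape of pattern that seems to trigger the error, not necessarily the exact one datasets builds):

from fsspec.utils import glob_translate

glob_translate("**/*.json")   # fine: '**' is an entire path component
glob_translate("**train*")    # illustrative only; raises ValueError: Invalid pattern: '**' can only be an entire path component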
Asked 2 months ago · 110 views
1 Answer
Accepted Answer

Looks like I had to upgrade my datasets version to 2.18.0 to get past this error.
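
In the notebook that amounts to something like this (a sketch; 2.18.0 is just the version that worked for me, and the kernel needs a restart so the upgraded package is the one that actually gets imported):

# cell 1: upgrade datasets, then restart the kernel
!pip install -U "datasets>=2.18.0"

# cell 2 (after the restart): verify the version and re-run the failing call
import datasets
print(datasets.__version__)
from datasets import load_dataset
dataset = load_dataset('imdb')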

Answered 2 months ago
