Questions tagged with Amazon SageMaker Model Training

Content language: English

Sort by most recent

Browse through the questions and answers listed below or filter and sort to narrow down your results.

Pytorch Lightning Progress bar not working on Sagemaker Jupyter Lab

Hello! We started using Sagemaker Jupyter Lab to run a few Deep Learning experiments we previously ran on GoogleColabPro+. The training starts fine and everything seems to work, however, the progress bar appears as follows: **Validation sanity check: 0it [00:00, ?it/s] Training: 0it [00:00, ?it/s]** The progress bar was working fine on GoogleColab. I tried uninstalling ipywidgets as [suggested here](https://github.com/PyTorchLightning/pytorch-lightning/issues/11208), but still no luck. Does anyone have an idea of how to fix the problem? Below you will find a copy of the TrainerFunction I am using. ``` class T5FineTuner(pl.LightningModule): def __init__(self, hparams): super(T5FineTuner, self).__init__() self.hparams = hparams self.model = T5ForConditionalGeneration.from_pretrained(hparams['model_name_or_path']) self.tokenizer = T5Tokenizer.from_pretrained(hparams['tokenizer_name_or_path']) def hparams(self): return self.hparams def is_logger(self): return True #self.trainer.proc_rank <= 0 def forward( self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None ): return self.model( input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, labels=labels, ) def _step(self, batch): labels = batch["target_ids"] labels[labels[:, :] == self.tokenizer.pad_token_id] = -100 outputs = self( input_ids=batch["source_ids"], attention_mask=batch["source_mask"], labels=labels, decoder_attention_mask=batch['target_mask'] ) loss = outputs[0] return loss def training_step(self, batch, batch_idx): loss = self._step(batch) tensorboard_logs = {"train_loss": loss} return {"loss": loss, "log": tensorboard_logs} def training_epoch_end(self, outputs): avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean() tensorboard_logs = {"avg_train_loss": avg_train_loss} # return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs} def 
validation_step(self, batch, batch_idx): loss = self._step(batch) return {"val_loss": loss} def validation_epoch_end(self, outputs): avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean() tensorboard_logs = {"val_loss": avg_loss} return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs} def configure_optimizers(self): "Prepare optimizer and schedule (linear warmup and decay)" model = self.model no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": self.hparams['weight_decay'], }, { "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams['learning_rate'], eps=self.hparams['adam_epsilon']) self.opt = optimizer return [optimizer] def optimizer_step(self, epoch=None, batch_idx=None, optimizer=None, optimizer_idx=None, optimizer_closure=None, second_order_closure=None, on_tpu=False, using_native_amp=False, using_lbfgs=False): # if self.trainer.use_tpu: # xm.optimizer_step(optimizer) # else: optimizer.step(closure=optimizer_closure) optimizer.zero_grad() self.lr_scheduler.step() def get_tqdm_dict(self): tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]} return tqdm_dict def train_dataloader(self): train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="translated_train", args=self.hparams) dataloader = DataLoader(train_dataset, batch_size=self.hparams['train_batch_size'], drop_last=True, shuffle=True, num_workers=4) t_total = ( (len(dataloader.dataset) // (self.hparams['train_batch_size'] * max(1, self.hparams['n_gpu']))) // self.hparams['gradient_accumulation_steps'] * float(self.hparams['num_train_epochs']) ) scheduler = get_linear_schedule_with_warmup( self.opt, num_warmup_steps=self.hparams['warmup_steps'], 
num_training_steps=t_total ) self.lr_scheduler = scheduler return dataloader def val_dataloader(self): val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="test_2k", args=self.hparams) return DataLoader(val_dataset, batch_size=self.hparams['eval_batch_size'], num_workers=4) logger = logging.getLogger(__name__) class LoggingCallback(pl.Callback): def on_validation_end(self, trainer, pl_module): logger.info("***** Validation results *****") if pl_module.is_logger(): metrics = trainer.callback_metrics # Log results for key in sorted(metrics): if key not in ["log", "progress_bar"]: logger.info("{} = {}\n".format(key, str(metrics[key]))) def on_test_end(self, trainer, pl_module): logger.info("***** Test results *****") if pl_module.is_logger(): metrics = trainer.callback_metrics # Log and save results to file output_test_results_file = os.path.join(pl_module.hparams["output_dir"], "test_results.txt") with open(output_test_results_file, "w") as writer: for key in sorted(metrics): if key not in ["log", "progress_bar"]: logger.info("{} = {}\n".format(key, str(metrics[key]))) writer.write("{} = {}\n".format(key, str(metrics[key]))) ```
1
answers
0
votes
161
views
asked 6 months ago

IncompleteSignature error while using Sklearn SDK

Currently, we are trying to train an SK-Learn model from a Python script running on a local computer by uploading data to an S3 bucket. ``` from sagemaker.amazon.amazon_estimator import get_image_uri # container = retrieve(framework='sklearn', region='us-east-1', version="0.23-1") container = sagemaker.image_uris.get_training_image_uri('us-east-1', 'sklearn', framework_version='0.23-1') sklearn_estimator = SKLearn( entry_point="script.py", # # role=get_execution_role(), role = role_aws, instance_count=1, instance_type="ml.m5.4xlarge", framework_version=FRAMEWORK_VERSION, base_job_name="rf-scikit", metric_definitions=[{"Name": "median-AE", "Regex": "AE-at-50th-percentile: ([0-9.]+).*$"}], hyperparameters={ "n-estimators": 100, "min-samples-leaf": 3, "features": "MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude", "target": "target", }, sagemaker_session=session, image_uri=container, image_uri_region='us-east-1', # output_path=model_output_path, ) # launch training job, with asynchronous call path_train_test = 's3://'+bucket_name+'/'+prefix sklearn_estimator.fit({"train": path_train_test, "test": path_train_test}, wait=False) ``` 'ClientError: An error occurred (IncompleteSignature) when calling the GetCallerIdentity operation: Credential must have exactly 5 slash-delimited elements, e.g. keyid/date/region/service/term, got 'https://elasticmapreduce.us-east-1b.amazonaws.com//20220406/us-east-1/sts/aws4_request' The access key and the secret key are passed through the session object via a client and passed to the SK-Learn estimator. ``` client_sagemaker = boto3.client('sagemaker', aws_access_key_id=accesskey , aws_secret_access_key=access_secret, ) session = sagemaker.Session(sagemaker_client =client_sagemaker ) ``` The same access key worked for the XGBoost model (already available in SageMaker). Any ideas about the reason?
0
answers
0
votes
23
views
asked 8 months ago

Is it possible to use smddp in notebook?

I recently tried the smddp v1.4.0 on SageMaker notebook instance (not sagemaker studio), using 8-GPU instances `ml.p3.16xlarge`, by directly using `smddp` as backend in the training scripts. I launched the estimator by setting `instance_type` to `local_gpu` and ended up with smddp error. Corresponding errors are attached below, saying an initialization error. ``` 42u1m0wni0-algo-1-36bbw | Traceback (most recent call last): 42u1m0wni0-algo-1-36bbw | File "true_main_notebook.py", line 636, in <module> 42u1m0wni0-algo-1-36bbw | main() 42u1m0wni0-algo-1-36bbw | File "true_main_notebook.py", line 178, in main 42u1m0wni0-algo-1-36bbw | dist.init_process_group(backend=args.dist_backend) 42u1m0wni0-algo-1-36bbw | File "/opt/conda/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 576, in init_process_group 42u1m0wni0-algo-1-36bbw | store, rank, world_size = next(rendezvous_iterator) 42u1m0wni0-algo-1-36bbw | File "/opt/conda/lib/python3.8/site-packages/torch/distributed/rendezvous.py", line 219, in _env_rendezvous_handler 42u1m0wni0-algo-1-36bbw | rank = int(_get_env_or_raise("RANK")) 42u1m0wni0-algo-1-36bbw | File "/opt/conda/lib/python3.8/site-packages/torch/distributed/rendezvous.py", line 203, in _get_env_or_raise 42u1m0wni0-algo-1-36bbw | raise _env_error(env_var) 42u1m0wni0-algo-1-36bbw | ValueError: Error initializing torch.distributed using env:// rendezvous: environment variable RANK expected, but not set 42u1m0wni0-algo-1-36bbw | Environment variable SAGEMAKER_INSTANCE_TYPE is not set 42u1m0wni0-algo-1-36bbw | Running smdistributed.dataparallel v1.4.0 42u1m0wni0-algo-1-36bbw | Error in atexit._run_exitfuncs: 42u1m0wni0-algo-1-36bbw | Traceback (most recent call last): 42u1m0wni0-algo-1-36bbw | File "/opt/conda/lib/python3.8/site-packages/smdistributed/dataparallel/torch/torch_smddp/__init__.py", line 51, in at_exit_smddp 42u1m0wni0-algo-1-36bbw | hm.shutdown() 42u1m0wni0-algo-1-36bbw | RuntimeError: Was this script started with smddprun? 
For more info on using smddprun, run smddprun -h 42u1m0wni0-algo-1-36bbw | 2022-04-03 16:07:30,005 sagemaker-training-toolkit ERROR Reporting training FAILURE 42u1m0wni0-algo-1-36bbw | 2022-04-03 16:07:30,005 sagemaker-training-toolkit ERROR ExecuteUserScriptError: 42u1m0wni0-algo-1-36bbw | ExitCode 1 42u1m0wni0-algo-1-36bbw | ErrorMessage "ValueError: Error initializing torch.distributed using env:// rendezvous: environment variable RANK expected, but not set 42u1m0wni0-algo-1-36bbw | Environment variable SAGEMAKER_INSTANCE_TYPE is not set Error in atexit._run_exitfuncs: Traceback (most recent call last): File "/opt/conda/lib/python3.8/site-packages/smdistributed/dataparallel/torch/torch_smddp/__init__.py", line 51, in at_exit_smddp hm.shutdown() RuntimeError: Was this script started with smddprun? For more info on using smddprun, run smddprun -h" ``` The original goal is to launch a single-node smddp for debugging. Does smddp only support being launched via the AWS Python SDK rather than from a notebook? Or is there something I've done that is not correct?
0
answers
0
votes
29
views
yzs
asked 8 months ago

Invoking endpoint outputs empty prediction data

Hello, I am able to invoke my endpoint using the following command template: > aws --profile 'insert_profile_name' sagemaker-runtime invoke-endpoint --endpoint-name 'insert_endpoint_name' --body fileb://'insert_image_file_path' --region 'insert_region' --content-type application/x-image output.txt However, this produces an output text file that contains the following: > {"prediction": []} Also, this appears in the terminal after running the command: > { "ContentType": "application/json", "InvokedProductionVariant": "variant-name-1" } The image I used to invoke my endpoint was also used for training the model. Here is my training job configuration (values that I've modified or added): > **Job Settings:** > Algorithm - Object Detection | Input Mode - Pipe > **Hyperparameters:** > num_classes - 1 | mini_batch_size - 1 | num_training_samples - 1 > **Input data configuration:** > *First channel:* > Name - validation | Input Mode - Pipe | Content Type - application/x-recordio | Record Wrapper - RecordIO | S3 Data Type - AugmentedManifestFile | Attribute Names - source-ref, bounding-box > *Second channel:* > Name - train | Input Mode - Pipe | Content Type - application/x-recordio | Record Wrapper - RecordIO | S3 Data Type - AugmentedManifestFile | Attribute Names - source-ref, bounding-box Any help would be appreciated. I can provide more information if needed. Thanks!
1
answers
0
votes
86
views
asked 8 months ago

How can I feed outputed augmented manifest file as input to blazingtext in a pipeline?

I'm creating a pipeline with multiple steps: one to preprocess a dataset, and the other takes the preprocessed one as an input to train a BlazingText model for classification. My first `ProcessingStep` outputs augmented manifest files: step_process = ProcessingStep( name="Nab3Process", processor=sklearn_processor, inputs=[ ProcessingInput(source=raw_input_data, destination=raw_dir), ProcessingInput(source=categories_input_data, destination=categories_dir) ], outputs=[ ProcessingOutput(output_name="train", source=train_dir), ProcessingOutput(output_name="validation", source=validation_dir), ProcessingOutput(output_name="test", source=test_dir), ProcessingOutput(output_name="mlb_train", source=mlb_data_train_dir), ProcessingOutput(output_name="mlb_validation", source=mlb_data_validation_dir), ProcessingOutput(output_name="mlb_test", source=mlb_data_test_dir), ProcessingOutput(output_name="le_vectorizer", source=le_vectorizer_dir), ProcessingOutput(output_name="mlb_vectorizer", source=mlb_vectorizer_dir) ], code=preprocessing_dir) But I'm having a hard time when I try to feed my `train` output as a `TrainingInput` to the model step to use it to train.
step_train = TrainingStep( name="Nab3Train", estimator=bt_train, inputs={ "train": TrainingInput( step_process.properties.ProcessingOutputConfig.Outputs[ "train" ].S3Output.S3Uri, distribution="FullyReplicated", content_type="application/x-recordio", s3_data_type='AugmentedManifestFile', attribute_names=['source', 'label'], input_mode='Pipe', record_wrapping='RecordIO' ), "validation": TrainingInput( step_process.properties.ProcessingOutputConfig.Outputs[ "validation" ].S3Output.S3Uri, distribution="FullyReplicated", content_type='application/x-recordio', s3_data_type='AugmentedManifestFile', attribute_names=['source', 'label'], input_mode='Pipe', record_wrapping='RecordIO' ) }) And I'm getting the following error 'FailureReason': 'ClientError: Could not download manifest file with S3 URL "s3://sagemaker-us-east-1-xxxxxxxxxx/Nab3Process-xxxxxxxxxx/output/train". Please ensure that the bucket exists in the selected region (us-east-1), that the manifest file exists at that S3 URL, and that the role "arn:aws:iam::xxxxxxxxxx:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole" has "s3:GetObject" permissions on the manifest file. Error message from S3: The specified key does not exist.' What Should I do?
0
answers
0
votes
13
views
asked 8 months ago