PyTorch Lightning progress bar not working on SageMaker Jupyter Lab
Hello!
We started using SageMaker Jupyter Lab to run a few deep learning experiments we previously ran on Google Colab Pro+. Training starts fine and everything seems to work; however, the progress bar appears as follows:
Validation sanity check: 0it [00:00, ?it/s] Training: 0it [00:00, ?it/s]
The progress bar was working fine on Google Colab. I tried uninstalling ipywidgets as suggested here, but still no luck. Does anyone have an idea of how to fix this?
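For reference, this is roughly what I ran in the notebook when I tried the ipywidgets uninstall/reinstall (a minimal sketch; the exact commands in my notebook may have differed slightly):

import sys

# Uninstall and reinstall ipywidgets inside the kernel's own environment,
# then restart the kernel so the change is picked up.
!{sys.executable} -m pip uninstall -y ipywidgets
!{sys.executable} -m pip install ipywidgets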
Below is a copy of the LightningModule and logging callback I am using.
import logging
import os

import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup,
)

# get_dataset is a helper defined elsewhere in the notebook.


class T5FineTuner(pl.LightningModule):
    def __init__(self, hparams):
        super(T5FineTuner, self).__init__()
        self.hparams = hparams
        self.model = T5ForConditionalGeneration.from_pretrained(hparams['model_name_or_path'])
        self.tokenizer = T5Tokenizer.from_pretrained(hparams['tokenizer_name_or_path'])

    def hparams(self):
        return self.hparams

    def is_logger(self):
        return True  # self.trainer.proc_rank <= 0

    def forward(
        self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
    ):
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
        )

    def _step(self, batch):
        labels = batch["target_ids"]
        # Replace padding token ids with -100 so they are ignored by the loss
        labels[labels[:, :] == self.tokenizer.pad_token_id] = -100
        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            labels=labels,
            decoder_attention_mask=batch['target_mask']
        )
        loss = outputs[0]
        return loss

    def training_step(self, batch, batch_idx):
        loss = self._step(batch)
        tensorboard_logs = {"train_loss": loss}
        return {"loss": loss, "log": tensorboard_logs}

    def training_epoch_end(self, outputs):
        avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
        tensorboard_logs = {"avg_train_loss": avg_train_loss}
        # return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        loss = self._step(batch)
        return {"val_loss": loss}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        tensorboard_logs = {"val_loss": avg_loss}
        return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams['weight_decay'],
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams['learning_rate'], eps=self.hparams['adam_epsilon'])
        self.opt = optimizer
        return [optimizer]

    def optimizer_step(self, epoch=None, batch_idx=None, optimizer=None, optimizer_idx=None, optimizer_closure=None, second_order_closure=None, on_tpu=False, using_native_amp=False, using_lbfgs=False):
        # if self.trainer.use_tpu:
        #     xm.optimizer_step(optimizer)
        # else:
        optimizer.step(closure=optimizer_closure)
        optimizer.zero_grad()
        self.lr_scheduler.step()

    def get_tqdm_dict(self):
        tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}
        return tqdm_dict

    def train_dataloader(self):
        train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="translated_train", args=self.hparams)
        dataloader = DataLoader(train_dataset, batch_size=self.hparams['train_batch_size'], drop_last=True, shuffle=True,
                                num_workers=4)
        # Total number of optimizer steps, used to build the linear warmup/decay schedule
        t_total = (
            (len(dataloader.dataset) // (self.hparams['train_batch_size'] * max(1, self.hparams['n_gpu'])))
            // self.hparams['gradient_accumulation_steps']
            * float(self.hparams['num_train_epochs'])
        )
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparams['warmup_steps'], num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="test_2k", args=self.hparams)
        return DataLoader(val_dataset, batch_size=self.hparams['eval_batch_size'], num_workers=4)


logger = logging.getLogger(__name__)


class LoggingCallback(pl.Callback):
    def on_validation_end(self, trainer, pl_module):
        logger.info("***** Validation results *****")
        if pl_module.is_logger():
            metrics = trainer.callback_metrics
            # Log results
            for key in sorted(metrics):
                if key not in ["log", "progress_bar"]:
                    logger.info("{} = {}\n".format(key, str(metrics[key])))

    def on_test_end(self, trainer, pl_module):
        logger.info("***** Test results *****")
        if pl_module.is_logger():
            metrics = trainer.callback_metrics
            # Log and save results to file
            output_test_results_file = os.path.join(pl_module.hparams["output_dir"], "test_results.txt")
            with open(output_test_results_file, "w") as writer:
                for key in sorted(metrics):
                    if key not in ["log", "progress_bar"]:
                        logger.info("{} = {}\n".format(key, str(metrics[key])))
                        writer.write("{} = {}\n".format(key, str(metrics[key])))
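For context, the Trainer is constructed along these lines (a simplified sketch with placeholder hyperparameter values; the real dict in my notebook contains the full set of keys the module reads, and some Trainer argument names depend on the Lightning version):

import pytorch_lightning as pl

hparams = {
    "model_name_or_path": "t5-base",      # placeholder
    "tokenizer_name_or_path": "t5-base",  # placeholder
    "num_train_epochs": 2,
    "gradient_accumulation_steps": 16,
}

model = T5FineTuner(hparams)

trainer = pl.Trainer(
    gpus=1,                                # GPUs available on the notebook instance
    max_epochs=hparams["num_train_epochs"],
    accumulate_grad_batches=hparams["gradient_accumulation_steps"],
    callbacks=[LoggingCallback()],
)
trainer.fit(model)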
1 Answer
Try installing it like this:

%%capture
import IPython
import sys

!{sys.executable} -m pip install ipywidgets
# IPython.Application.instance().kernel.do_shutdown(True)  # has to restart the kernel so the changes are used
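After the kernel restart, a quick way to check whether the widgets frontend is actually working (independent of Lightning) is to render a widget directly; if nothing displays, the problem is the ipywidgets/JupyterLab extension rather than the Trainer:

# If this progress widget renders in the notebook output,
# ipywidgets is installed and wired up correctly in the kernel.
import ipywidgets as widgets
widgets.IntProgress(value=3, min=0, max=10, description="test")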