I am trying to run inference with GPT-2 on an inf2 instance using this transformers-neuronx example:
https://github.com/aws-neuron/transformers-neuronx#hugging-face-generate-api-support
I keep getting the following error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[1], line 14
12 # Create and compile the Neuron model
13 model_neuron = GPT2ForSampling.from_pretrained('gpt2-split', batch_size=1, tp_degree=2, n_positions=256, amp='f32', unroll=None)
---> 14 model_neuron.to_neuron()
16 # Use the `HuggingFaceGenerationModelAdapter` to access the generate API
17 model = HuggingFaceGenerationModelAdapter(model_cpu.config, model_neuron)
File ~/infenv/lib/python3.8/site-packages/transformers_neuronx/gpt2/model.py:117, in GPT2ForSampling.to_neuron(self)
115 self.decoder_lm_head.add_lm_head(lm_head.weight.detach().T)
116 lm_head.nullify()
--> 117 self.decoder_lm_head.to_neuron()
118 # We need to reset once, since there might be NaN initially in KVcache.
119 # This is done right after weight loading which is shared for different generation methods.
120 self.reset()
File ~/infenv/lib/python3.8/site-packages/transformers_neuronx/decoder.py:121, in DecoderLmHeadForSamplingNoEmbedding.to_neuron(self)
118 ln_lm_head_params.append(self.lm_head_bias)
120 self.program = self._build_program()
--> 121 self.program.setup(self.layers, ln_lm_head_params)
File ~/infenv/lib/python3.8/site-packages/transformers_neuronx/decoder.py:872, in DecoderProgramFullyUnrolled.setup(self, layers, ln_lm_head_params)
871 def setup(self, layers, ln_lm_head_params):
--> 872 super().setup(layers, ln_lm_head_params)
873 for npos, memory in zip(self.n_positions_list, self.memories):
874 input_tensors = [*self.input_buffers]
File ~/infenv/lib/python3.8/site-packages/transformers_neuronx/decoder.py:827, in DecoderProgram.setup(self, layers, ln_lm_head_params)
824 executor.submit(kernel.build, bucket_size)
826 for kernel in self.kernels:
--> 827 kernel.load()
File ~/infenv/lib/python3.8/site-packages/transformers_neuronx/compiler.py:376, in ParallelKernel.load(self)
375 def load(self):
--> 376 self.model = torch.classes.neuron.ParallelModel(self.neff_bytes, self.tp_degree, self.g_start_device_id, self.g_device_count)
377 self.model.load()
RuntimeError: __init__() expected at most 3 argument(s) but received 5 argument(s). Declaration: __init__(__torch__.torch.classes.neuron.ParallelModel _0, str _1, int _2) -> NoneType _0