I am unable to parse the streaming output from the Llama2-7b-chat model deployed on SageMaker JumpStart.
I have written custom code to parse each event from the stream. However, the entire output is returned at once in a single event, instead of arriving incrementally as a series of byte chunks.
The code snippet is as shown below:
# Client for the SageMaker runtime API, used to invoke the deployed endpoint.
client = boto3.client("sagemaker-runtime")

# A single dialog: the system prompt followed by the user's question.
dialog = [
    {"role": "system", "content": "You are a bot designed to answer queries from user"},
    {"role": "user", "content": "What are the must-visit attractions in Florida?"},
]

# Generation settings sent alongside the dialog.
generation_parameters = {
    "max_new_tokens": 512,
    "return_full_text": False,
}

# The endpoint receives a list of dialogs; only one is sent here.
request_body = {
    "inputs": [dialog],
    "parameters": generation_parameters,
}

# Invoke the endpoint through the response-stream API; the streamed events
# are exposed under response["Body"].
# NOTE(review): `endpoint_name` must be defined before this point.
response = client.invoke_endpoint_with_response_stream(
    EndpointName=endpoint_name,
    Body=json.dumps(request_body),
    ContentType="application/json",
    CustomAttributes="accept_eula=true",
)
import json  # local to this snippet so the parsing below is self-contained


def _iter_content(node):
    """Yield every string stored under a "content" key anywhere in *node*.

    *node* is a value produced by ``json.loads``; nested lists and dicts are
    walked recursively so the exact response layout does not need to be known.
    """
    if isinstance(node, dict):
        for key, value in node.items():
            if key == "content" and isinstance(value, str):
                yield value
            else:
                yield from _iter_content(value)
    elif isinstance(node, list):
        for item in node:
            yield from _iter_content(item)


# Consume the event stream, buffering bytes until they form a complete JSON
# document. A document (or a multi-byte UTF-8 character) may be split across
# several events, so each event must not be decoded and parsed on its own.
# NOTE(review): if the deployed container does not actually stream, the whole
# response arrives in one PayloadPart and is printed in one go -- confirm that
# the deployed model version supports token streaming.
event_stream = response['Body']
pending = bytearray()
for event in event_stream:
    part = event.get("PayloadPart")
    if part is None:
        # Not a payload event; nothing to append.
        continue
    pending.extend(part["Bytes"])

    try:
        text = pending.decode("utf-8")
        document = json.loads(text)
    except ValueError:
        # Incomplete UTF-8 sequence or incomplete JSON (UnicodeDecodeError and
        # json.JSONDecodeError are both ValueError): wait for more bytes.
        continue
    pending.clear()

    contents = list(_iter_content(document))
    if contents:
        for content in contents:
            print(content)
    else:
        # No "content" field found: show the payload as received.
        print(text)

if pending:
    # The stream ended with bytes that never formed valid JSON; print them
    # rather than dropping them silently.
    print(pending.decode("utf-8", errors="replace"))