How to feed Textract output into Comprehend Medical
Hi
I'm trying to modify this code at https://github.com/aws-samples/amazon-textract-enhancer/blob/master/functions/detect-text-postprocess-page.py
I want to be able to pipe the results of textract analysis into comprehend medical api
I've tried hacking the code, but unfortunately I'm not a Python expert
Any ideas how best to approach this?
cheers
Jon
from textract_util import *
import io
import os
import json
import time
import boto3
def _parse_textract_notification(record):
    """Return the decoded Textract message carried by an SNS record, or None."""
    sns = record.get('Sns')
    if not sns or 'Message' not in sns:
        return None
    return json.loads(sns['Message'])


def _split_s3_key(object_key):
    """Split an S3 key into (folder path, file name without its extension)."""
    folder, _, filename = object_key.rpartition("/")
    # Only strip an extension that belongs to the file name itself; a "." in a
    # folder name or a name without any "." must leave the name untouched.
    stem, dot, _ = filename.rpartition(".")
    return folder, (stem if dot else filename)


def _list_json_urls(s3_client, bucket, upload_prefix):
    """Collect the URLs of every object ending in "json" directly under the prefix."""
    urls = []
    request = {
        "Bucket": bucket,
        "Prefix": "{}/".format(upload_prefix),
        "Delimiter": "/",
    }
    while True:
        page = s3_client.list_objects_v2(**request)
        # A page may legitimately have no 'Contents' key when nothing matches.
        for obj in page.get('Contents', []):
            if obj['Key'].endswith("json"):
                urls.append("https://s3.amazonaws.com/{}/{}".format(bucket, obj['Key']))
        if not page.get('IsTruncated'):
            return urls
        request["ContinuationToken"] = page['NextContinuationToken']


def _process_record(record, s3, textract, dynamodb, table_name):
    """Handle one event record and return the JSON result URLs found for its job."""
    message = _parse_textract_notification(record)
    if message is None:
        # Without a message there is no job id or bucket to work with, so the
        # S3 and DynamoDB calls below could not succeed.
        print("Skipping record without an SNS message")
        return []

    textractJobId = message['JobId']
    print("{} = {}".format("JobId", textractJobId))
    textractStatus = message['Status']
    print("{} = {}".format("Status", textractStatus))
    # Divides the notification timestamp by 1000 and truncates to an integer.
    textractTimestamp = str(int(float(message['Timestamp']) / 1000))
    print("{} = {}".format("Timestamp", textractTimestamp))
    print("{} = {}".format("API", message['API']))
    print("{} = {}".format("JobTag", message['JobTag']))
    documentLocation = message['DocumentLocation']
    textractS3ObjectName = documentLocation['S3ObjectName']
    print("{} = {}".format("S3ObjectName", textractS3ObjectName))
    bucket = documentLocation['S3Bucket']
    print("{} = {}".format("S3Bucket", bucket))

    document_path, document_name = _split_s3_key(textractS3ObjectName)
    if document_path == "":
        upload_prefix = textractJobId
    else:
        upload_prefix = "{}/{}".format(document_path, textractJobId)
    print("upload_prefix = " + upload_prefix)

    num_pages, documentBlocks = GetTextDetectionResult(textract, textractJobId)
    job_key = {
        'JobId': {'S': textractJobId},
        'JobType': {'S': 'TextDetection'}
    }

    if documentBlocks:
        print("{} Blocks retrieved".format(len(documentBlocks)))

        # Extract lines of text into a Python dictionary by parsing the raw JSON from Textract
        blocks = groupBlocksByType(documentBlocks)
        document_text, num_lines = extractTextBody(blocks)

        # Write the extracted text to a JSON file and upload it next to the job's other results
        json_document = "{}-text.json".format(document_name)
        local_path = "/tmp/" + json_document
        with open(local_path, 'w') as json_file:
            json_file.write(json.dumps(document_text, indent=4, sort_keys=True))
        uploaded_key = "{}/{}".format(upload_prefix, json_document)
        s3.meta.client.upload_file(local_path, bucket, uploaded_key)

        try:
            dynamodb.update_item(
                TableName=table_name,
                Key=job_key,
                ExpressionAttributeNames={"#tf": "TextFiles", "#jst": "JobStatus", "#jct": "JobCompleteTimeStamp", "#nl": "NumLines", "#np": "NumPages"},
                UpdateExpression='SET #tf = list_append(#tf, :text_files), #jst = :job_status, #jct = :job_complete, #nl = :num_lines, #np = :num_pages',
                ExpressionAttributeValues={
                    ":text_files": {"L": [{"S": uploaded_key}]},
                    ":job_status": {"S": textractStatus},
                    ":job_complete": {"N": str(textractTimestamp)},
                    ":num_lines": {"N": str(num_lines)},
                    ":num_pages": {"N": str(num_pages)}
                }
            )
        except Exception as e:
            # Best effort: a failed bookkeeping update is logged, not raised.
            print('DynamoDB Insertion Error is: {0}'.format(e))
    else:
        # No blocks came back: record only the job status and completion time.
        try:
            dynamodb.update_item(
                TableName=table_name,
                Key=job_key,
                ExpressionAttributeNames={"#jst": "JobStatus", "#jct": "JobCompleteTimeStamp"},
                UpdateExpression='SET #jst = :job_status, #jct = :job_complete',
                ExpressionAttributeValues={
                    ":job_status": {"S": textractStatus},
                    ":job_complete": {"N": str(textractTimestamp)}
                }
            )
        except Exception as e:
            print('DynamoDB Insertion Error is: {0}'.format(e))

    return _list_json_urls(s3.meta.client, bucket, upload_prefix)


def lambda_handler(event, context):
    """Post-process Textract text-detection jobs announced through SNS records.

    For every entry in ``event['Records']`` that carries an SNS message, the
    handler fetches the job's blocks with ``GetTextDetectionResult``, writes
    the extracted text as ``<document name>-text.json`` under
    ``<document folder>/<job id>/`` in the source bucket, updates the job's
    item in the DynamoDB table named by the ``table_name`` environment
    variable, and lists the JSON objects stored under that prefix.

    Args:
        event: Lambda event; only its optional ``Records`` list is read.
        context: Lambda context object (unused).

    Returns:
        A list of ``https://s3.amazonaws.com/<bucket>/<key>`` URLs, one per
        JSON object found under the upload prefix of each processed record.

    Raises:
        KeyError: if ``table_name`` is not set in the environment or a
            notification message lacks one of the fields read from it.
    """
    # Initialize Boto resources
    s3 = boto3.resource('s3')
    textract = boto3.client('textract')
    dynamodb = boto3.client('dynamodb')
    table_name = os.environ['table_name']
    file_list = []

    if "Records" in event:
        records = event['Records']
        print("{} messages recieved".format(len(records)))
        for record in records:
            file_list.extend(
                _process_record(record, s3, textract, dynamodb, table_name)
            )

    print(file_list)
    return file_list
thanks
Jon
Hi,
Here is a link to some fairly straightforward Python code that takes an image, runs it through Amazon Textract, and feeds the extracted text into Amazon Comprehend Medical. Look for the section "Natural language processing for medical documents"
https://aws.amazon.com/blogs/machine-learning/automatically-extract-text-and-structured-data-from-documents-with-amazon-textract/
In this example, use the following document to extract text using Amazon Textract. You then use Amazon Comprehend Medical to extract medical entities, such as medical condition, medication, dosage, strength, and protected health information (PHI).
The python code from that link is below:
import boto3
# Demo: extract the text lines of an image stored in S3 with Amazon Textract,
# then pass the combined text to Amazon Comprehend Medical and print the
# medical entities it reports.

# Document
s3BucketName = "ki-textract-demo-docs"
documentName = "medical-notes.png"

# Amazon Textract client
textract = boto3.client('textract')

# Call Amazon Textract
response = textract.detect_document_text(
    Document={
        'S3Object': {
            'Bucket': s3BucketName,
            'Name': documentName
        }
    })

#print(response)

# Print text
print("\nText\n========")
lines = []
for item in response["Blocks"]:
    # Only LINE blocks are printed and collected; other block types are skipped.
    if item["BlockType"] == "LINE":
        print('\033[94m' + item["Text"] + '\033[0m')
        lines.append(item["Text"])
# Every line is prefixed with a single space, so the text starts with a space.
text = "".join(" " + line for line in lines)

# Amazon Comprehend client
comprehend = boto3.client('comprehendmedical')

# Detect medical entities
# NOTE(review): Comprehend Medical limits the size of the input text; long
# documents presumably need to be split into chunks first - confirm the limit.
entities = comprehend.detect_entities(Text=text)

print("\nMedical Entities\n========")
for entity in entities["Entities"]:
    print("- {}".format(entity["Text"]))
    print(" Type: {}".format(entity["Type"]))
    print(" Category: {}".format(entity["Category"]))
    if entity["Traits"]:
        print(" Traits:")
        for trait in entity["Traits"]:
            print(" - {}".format(trait["Name"]))
    print("\n")
Hope this helps,
-randy
Relevant questions
Textract table extraction, splitting the table into two horizontal parts. How to get past this.
asked a month agoTextract to multi column pdf files
asked 2 days agoComprehend Medical Entity Detection Classification Accuracy
asked 3 years agoAWS Kendra - Search PDF with handwritten text
asked 21 days agoEmbed comprehend medical visualizer UI into browser-based (React) app
asked 3 years agoHow to extract key value pairs from Textract with A2I JSON output??
asked 4 months agoHow does textract determine when to segment text vertically or horizontally?
asked 4 months agoTextract - How to extract just certain fields
asked 3 months agoCustom entity annotation with groundtruth
asked 2 years agoHow to feed Textract output into Comprehend Medical
Accepted Answerasked 3 years ago