By using AWS re:Post, you agree to the Terms of Use

How to feed Textract output into Comprehend Medical

0

Hi
I'm trying to modify this code at https://github.com/aws-samples/amazon-textract-enhancer/blob/master/functions/detect-text-postprocess-page.py

I want to be able to pipe the results of the Textract analysis into the Amazon Comprehend Medical API.
I've tried hacking the code, but unfortunately I'm not a Python expert.
Any ideas how best to approach this?

cheers
Jon

from textract_util import *
import io
import os
import json
import time
import boto3

def lambda_handler(event, context):
    """Post-process completed Textract text-detection jobs announced via SNS.

    For every record in ``event['Records']`` that carries an ``Sns`` message,
    the handler:

    1. parses the JSON message (JobId, Status, Timestamp, API, JobTag and
       DocumentLocation),
    2. fetches the detection result with ``GetTextDetectionResult``,
    3. writes the extracted text as ``<document_name>-text.json`` to ``/tmp``
       and uploads it to the source bucket under ``<document_path>/<JobId>/``,
    4. records the outcome in the DynamoDB table named by the ``table_name``
       environment variable,
    5. collects the URLs of all ``*json`` objects found under that prefix.

    ``GetTextDetectionResult``, ``groupBlocksByType`` and ``extractTextBody``
    are not defined here; presumably they come from ``textract_util``.

    Args:
        event: Lambda event; only its ``Records`` list is read.
        context: Lambda context object (unused).

    Returns:
        list[str]: ``https://s3.amazonaws.com/<bucket>/<key>`` URLs of the JSON
        files found under the upload prefix of every processed record.

    Raises:
        KeyError: if the ``table_name`` environment variable is not set, or
            an SNS message lacks one of the expected fields.
    """
    # Initialize Boto resources
    s3 = boto3.resource('s3')
    textract = boto3.client('textract')
    dynamodb = boto3.client('dynamodb')
    table_name = os.environ['table_name']
    file_list = []

    if "Records" in event:
        records = event['Records']
        numRecords = len(records)

        print("{} messages recieved".format(numRecords))
        for record in records:
            documentBlocks = None
            num_pages = 0
            num_lines = 0
            bucket = ""
            upload_prefix = ""
            textractJobId = ""
            textractStatus = ""
            textractAPI = ""
            textractJobTag = ""
            textractS3ObjectName = ""
            textractS3Bucket = ""
            textractTimestamp = ""
            if 'Sns' in record:
                sns = record['Sns']
                if 'Message' in sns:
                    message = json.loads(sns['Message'])
                    textractJobId = message['JobId']
                    print("{} = {}".format("JobId", textractJobId))
                    textractStatus = message['Status']
                    print("{} = {}".format("Status", textractStatus))
                    # Presumably converts a millisecond timestamp to whole
                    # seconds - confirm against the SNS payload.
                    textractTimestamp = str(int(float(message['Timestamp']) / 1000))
                    print("{} = {}".format("Timestamp", textractTimestamp))
                    textractAPI = message['API']
                    print("{} = {}".format("API", textractAPI))
                    textractJobTag = message['JobTag']
                    print("{} = {}".format("JobTag", textractJobTag))
                    documentLocation = message['DocumentLocation']
                    textractS3ObjectName = documentLocation['S3ObjectName']
                    print("{} = {}".format("S3ObjectName", textractS3ObjectName))
                    textractS3Bucket = documentLocation['S3Bucket']
                    print("{} = {}".format("S3Bucket", textractS3Bucket))

                    bucket = textractS3Bucket
                    # Split the object key into "folder" and base name (no extension).
                    document_path = textractS3ObjectName[:textractS3ObjectName.rfind("/")] if textractS3ObjectName.find("/") >= 0 else ""
                    document_name = textractS3ObjectName[textractS3ObjectName.rfind("/")+1:textractS3ObjectName.rfind(".")] if textractS3ObjectName.find("/") >= 0 else textractS3ObjectName[:textractS3ObjectName.rfind(".")]

                    if document_path == "":
                        upload_prefix = textractJobId
                    else:
                        upload_prefix = "{}/{}".format(document_path, textractJobId)

                    print("upload_prefix = " + upload_prefix)

                    num_pages, documentBlocks = GetTextDetectionResult(textract, textractJobId)

            if not textractJobId:
                # No Textract SNS message in this record: there is no job to
                # update and no bucket to list, so the S3/DynamoDB calls below
                # could only fail on empty identifiers.
                print("Skipping record without a Textract SNS message")
                continue

            if documentBlocks is not None and len(documentBlocks) > 0:
                print("{} Blocks retrieved".format(len(documentBlocks)))

                # Extract lines of text into a Python dictionary by parsing the raw JSON from Textract
                blocks = groupBlocksByType(documentBlocks)
                document_text, num_lines = extractTextBody(blocks)

                # Generate JSON document from the extracted text
                json_document = "{}-text.json".format(document_name)
                # /tmp is the scratch location used before uploading to S3;
                # `with` guarantees the file is flushed and closed first.
                with open("/tmp/" + json_document, 'w') as json_file:
                    json_file.write(json.dumps(document_text, indent=4, sort_keys=True))
                s3.meta.client.upload_file("/tmp/" + json_document, bucket, "{}/{}".format(upload_prefix, json_document))

                try:
                    dynamodb.update_item(
                        TableName=table_name,
                        Key={
                            'JobId': {'S': textractJobId},
                            'JobType': {'S': 'TextDetection'}
                        },
                        ExpressionAttributeNames={"#tf": "TextFiles", "#jst": "JobStatus", "#jct": "JobCompleteTimeStamp", "#nl": "NumLines", "#np": "NumPages"},
                        UpdateExpression='SET #tf = list_append(#tf, :text_files), #jst = :job_status, #jct = :job_complete, #nl = :num_lines, #np = :num_pages',
                        ExpressionAttributeValues={
                            ":text_files": {"L": [{"S": "{}/{}".format(upload_prefix, json_document)}]},
                            ":job_status": {"S": textractStatus},
                            ":job_complete": {"N": str(textractTimestamp)},
                            ":num_lines": {"N": str(num_lines)},
                            ":num_pages": {"N": str(num_pages)}
                        }
                    )
                except Exception as e:
                    # Best effort: a failed bookkeeping update must not abort the batch.
                    print('DynamoDB Insertion Error is: {0}'.format(e))
            else:
                # Nothing was extracted; still record the job status and completion time.
                try:
                    dynamodb.update_item(
                        TableName=table_name,
                        Key={
                            'JobId': {'S': textractJobId},
                            'JobType': {'S': 'TextDetection'}
                        },
                        ExpressionAttributeNames={"#jst": "JobStatus", "#jct": "JobCompleteTimeStamp"},
                        UpdateExpression='SET #jst = :job_status, #jct = :job_complete',
                        ExpressionAttributeValues={
                            ":job_status": {"S": textractStatus},
                            ":job_complete": {"N": str(textractTimestamp)}
                        }
                    )
                except Exception as e:
                    print('DynamoDB Insertion Error is: {0}'.format(e))

            # Collect every JSON object directly under this job's prefix.
            list_prefix = "{}/".format(upload_prefix)
            s3_result = s3.meta.client.list_objects_v2(Bucket=bucket, Prefix=list_prefix, Delimiter="/")
            if 'Contents' in s3_result:

                for key in s3_result['Contents']:
                    if key['Key'].endswith("json"):
                        file_list.append("https://s3.amazonaws.com/{}/{}".format(bucket, key['Key']))

                while s3_result['IsTruncated']:
                    continuation_key = s3_result['NextContinuationToken']
                    s3_result = s3.meta.client.list_objects_v2(Bucket=bucket, Prefix=list_prefix, Delimiter="/", ContinuationToken=continuation_key)
                    # A continuation page is not guaranteed to carry 'Contents'.
                    for key in s3_result.get('Contents', []):
                        if key['Key'].endswith("json"):
                            file_list.append("https://s3.amazonaws.com/{}/{}".format(bucket, key['Key']))

            print(file_list)

    return file_list

thanks
Jon

asked 3 years ago · 123 views
1 Answer
0
Accepted Answer

Hi,

Here is a link to some fairly straightforward Python code that takes an image, extracts its text with Amazon Textract, and feeds that text into Amazon Comprehend Medical. Look for the section "Natural language processing for medical documents".
https://aws.amazon.com/blogs/machine-learning/automatically-extract-text-and-structured-data-from-documents-with-amazon-textract/

In this example, use the following document to extract text using Amazon Textract. You then use Amazon Comprehend Medical to extract medical entities, such as medical condition, medication, dosage, strength, and protected health information (PHI).

The python code from that link is below:

import boto3

# Extract the text lines of an image stored in S3 with Amazon Textract, then
# pass the combined text to Amazon Comprehend Medical and print each detected
# medical entity with its type, category and traits.

# Document
s3BucketName = "ki-textract-demo-docs"
documentName = "medical-notes.png"

# Amazon Textract client
textract = boto3.client('textract')

# Call Amazon Textract
response = textract.detect_document_text(
    Document={
        'S3Object': {
            'Bucket': s3BucketName,
            'Name': documentName
        }
    })

#print(response)

# Print text
print("\nText\n========")
# Keep only LINE blocks; each one carries the text of one detected line.
lines = [item["Text"] for item in response["Blocks"] if item["BlockType"] == "LINE"]
for line in lines:
    # ANSI escape codes: print the line in blue, then reset the colour.
    print('\033[94m' + line + '\033[0m')
# Join once instead of concatenating inside the loop (no leading space).
text = " ".join(lines)

# Amazon Comprehend client
comprehend = boto3.client('comprehendmedical')

# Detect medical entities
# detect_entities is deprecated; AWS directs callers to detect_entities_v2,
# whose "Entities" items expose the same Text/Type/Category/Traits keys used below.
entities = comprehend.detect_entities_v2(Text=text)
print("\nMedical Entities\n========")
for entity in entities["Entities"]:
    print("- {}".format(entity["Text"]))
    print("   Type: {}".format(entity["Type"]))
    print("   Category: {}".format(entity["Category"]))
    traits = entity.get("Traits")
    if traits:
        print("   Traits:")
        for trait in traits:
            print("    - {}".format(trait["Name"]))
    print("\n")

Hope this helps,
-randy

answered 3 years ago

You are not logged in. Log in to post an answer.

A good answer clearly answers the question, provides constructive feedback, and encourages professional growth in the question asker.

Guidelines for Answering Questions