Can someone please explain what I'm doing wrong here. Thank you. Textract Related

0

GOAL: I want to be able to increase our ability to textract as many documents as we can because being capped at 15 is a joke and won't work.


HERE IS MY CODE:

import boto3 import os import time import psycopg2 import threading

s3_client = boto3.client("s3") textract_client = boto3.client("textract") class Extract_Text:

def __init__(self, list_of_records, database_connection, pdf_number, document_type):

    #self.type_of_query = type_of_query
    self.database_connection = database_connection
    self.records = list_of_records
    self.part_number = pdf_number
    self.document_type = document_type
    '''
    cursor = self.database_connection.cursor()
    if self.type_of_query == "property":
        cursor.execute("""Select rds_uid from property_information where (cfn is not null) and (cfn != 'Unavailable') and (textract is null)""") # Make sure not unavailable
    if self.type_of_query == "liens":
        cursor.execute("""Select rds_uid from liens where (textract is null)""") # Make sure not unavailable
    self.records = cursor.fetchall()
    cursor.close()
    '''
    print("init done")
    self.textract_all_records()

def textract_all_records(self):
    for num,rds_uid in enumerate(self.records):
        textract_client = boto3.client("textract")
        rds_uid = rds_uid[0]
        full_text = self.scan_pdf(rds_uid, textract_client)
        self.upload_data(full_text,rds_uid)
        print(f"done {self.part_number} - {rds_uid}")
        textract_client.close()
        #time.sleep(10000)

def scan_pdf(self, rds_uid, textract_client): # Scans the LN PDF for PDF Keywords
    try:
        print(rds_uid)
        full_text = self.return_text(rds_uid, textract_client)
        return full_text
    except Exception as e:
        print(e)
        time.sleep(10000)


def return_text(self, rds_uid, textract_client):
    prop_pdf = s3_client.get_object(Bucket="gofundingfinal", Key=f"Florida/PDFS/{rds_uid}.pdf")["Body"].read()
    with open(f"temp{self.part_number}.pdf", "wb") as f:
        f.write(prop_pdf)
    
    with open(f"temp{self.part_number}.pdf", "rb") as f:
        pdf_bytes = f.read()
        s3_client.put_object(
            Body=pdf_bytes,
            Bucket="pdfreading",
            Key=f"{rds_uid}.pdf"
        )
    #time.sleep(10000)

    while True:
        try:
            job_id = textract_client.start_document_text_detection(
                DocumentLocation={
                    "S3Object" : {
                        "Bucket" : "pdfreading",
                        "Name" : f"{rds_uid}.pdf"
                    },
                }
            )["JobId"]
            break
        except Exception as e:
            pass

    response = textract_client.get_document_text_detection(
        JobId=job_id
    )
    print("In progress")
    while response["JobStatus"] == "IN_PROGRESS":
        try:
            #print(response["JobStatus"])
            response = textract_client.get_document_text_detection(
                JobId=job_id
            )
        except Exception as e:
            print(e)
            print("we are ok!")
            time.sleep(1)
    
    print("done")
    all_text = ""
    while True:
        try:
            #print(response["NextToken"])
            #time.sleep(10000)
            response_blocks = response["Blocks"]
            for text in response_blocks:
                try:
                    all_text += text["Text"] + " "
                except:
                    pass
            response = textract_client.get_document_text_detection(
                JobId=job_id,
                NextToken=response["NextToken"]
            )
            print(response["NextToken"])
        except:
            break

    s3_client.delete_object(Bucket="pdfreading", Key=f"{rds_uid}.pdf")
    while f"temp{self.part_number}.pdf" in os.listdir(os.getcwd()):
        try:
            os.remove(f"temp{self.part_number}.pdf")
            break
        except:
            pass
    return all_text

'''
12011117880550
An error occurred (ProvisionedThroughputExceededException) when calling the GetDocumentTextDetection operation (reached max retries: 4): Provisioned rate exceeded
'''
def upload_data(self, textract_data, rds_uid):
    print("f")
    database_cursor = self.database_connection.cursor()
    print("fu")
    print(textract_data)
    '''
    if self.type_of_query == "property":#12011117596827 12011117596827
        insert_data = f"""
        update property_information set (textract,rds_uid) = (%s,%s) where (rds_uid = '{rds_uid}');
        """
    if self.type_of_query == "liens":
        insert_data = f"""
        update liens set (textract,rds_uid) = (%s,%s) where (rds_uid = '{rds_uid}');
        """
    '''
    insert_data = f"""
        update {document_type} set (textract,rds_uid) = (%s,%s) where (rds_uid = '{rds_uid}');
        """
    while True:
        try:
            database_cursor.execute(insert_data, (textract_data,rds_uid)) #!!
            break
        except Exception as e:
            print(e)
            time.sleep(1)
    self.database_connection.commit()
    database_cursor.close()

pick either leins or lis_pendens and enter into document type

document_type = "lis_pendens" # liens|lis_pendens

cursor = database_connection.cursor() #cursor.execute("""Select rds_uid from property_information where (cfn is not null) and (cfn != 'Unavailable') and (textract is null)""") # Make sure not unavailable cursor.execute(f"""Select rds_uid from {document_type} where (textract is null)""") # Make sure not unavailable records = cursor.fetchall() cursor.close()

parts = 15 record_batch = [] equal_value = int(len(records)/parts) for i in range(parts): if i == parts - 1: record_batch.append(records[equal_value * (i):]) continue record_batch.append(records[equal_value * i:equal_value * (i + 1)])

#print(record_batch) for i in record_batch: print(len(i))

textract_client.close() for num,list_of_records in enumerate(record_batch): #pass #Extract_Text#(list_of_records,database_connection,num) threading.Thread(target=Extract_Text,args=(list_of_records,database_connection,num,document_type,)).start() #if num == 1: #break

#done 0 - 12011117798101 #done 1 - 12011118146730


HERE IS THE SUPPORT TICKET RESPONSE TO ME TRYING TO UP MY SERVICE LIMIT: Hello,

This is Diego from AWS. I am checking in on your case as it has been a number of day since I last communicated.

Firstly, I want to thank you for your patience and cooperation regarding this limit increase request. The service team has finally taken all the information you have provided and have completed their consideration.

The team has granted a partial approval for the GetDocumentTextDetection API to 35 TPS.

They have also elected to keep the StartDocumentTextDetection at the limit of 15 TPS as they noticed you are not using more than 5 TPS for StartDocumentTextDetection; only getting throttled for GetDocumentTextDetection with max 6TPS.

We suggest you review the following documentation to see the best way to use a Get API is to get the results of the job processed after the job has been completed by Textract. This goal can be achieved by using a SNS topic and is detailed thoroughly here:

https://docs.aws.amazon.com/textract/latest/dg/async-analyzing-with-sqs.html .

If you continue to get throttled and this limit is still not sufficient, please let us know. We will be happy to reach back out to the team with your request. Also provide a detailed use case as to what your needs are and why the steps provided in the documentation do not best assess your needs.

That being said, if your issue is fully resolved, please feel free to resolve the case via the link below. Our system will resolve the case automatically should you choose not resolve it manually.

It is my hope that my service has met or gone beyond your expectations as our customer's satisfaction is my number one priority.

Thank you for contacting AWS Support.

Hope to hear from you soon!

We value your feedback. Please share your experience by rating this correspondence using the AWS Support Center link at the end of this correspondence. Each correspondence can also be rated by selecting the stars in top right corner of each correspondence within the AWS Support Center.

Best regards, Diego C. Amazon Web Services


HERE IS THE ERROR: An error occurred (ProvisionedThroughputExceededException) when calling the GetDocumentTextDetection operation (reached max retries: 4): Provisioned rate exceeded we are ok! In progress An error occurred (ProvisionedThroughputExceededException) when calling the GetDocumentTextDetection operation (reached max retries: 4): Provisioned rate exceeded we are ok! An error occurred (ProvisionedThroughputExceededException) when calling the GetDocumentTextDetection operation (reached max retries: 4): Provisioned rate exceeded we are ok! An error occurred (ProvisionedThroughputExceededException) when calling the GetDocumentTextDetection operation (reached max retries: 4): Provisioned rate exceeded we are ok!


I'm literally about to go back to tesseract and stop paying for an OCR.

1 Answer
0

Hello Bobby,

We wanted to let you know we removed some sensitive information from your post for your security, and that of your account. Please refrain from posting personal identifiable information in the future.

Thank you,

  • Ann D.
profile pictureAWS
EXPERT
answered 2 years ago

You are not logged in. Log in to post an answer.

A good answer clearly answers the question and provides constructive feedback and encourages professional growth in the question asker.

Guidelines for Answering Questions