I'm trying to extract tables asynchronously from a pdf document, using Textract with the Javascript V3 sdk. I successfully get a jobID from the StartDocumentAnalysisCommand, however when calling GetDocumentAnalysisCommand it appears that the data is shown in the response with blockType "PAGE" and "LINE", but there are no "Tables" blockTypes in the response returned.
When trying the demo in the console, it appears to extract the table data, so i'm not sure if there's an error in my code. Heres some of the code;
async function getJobResults(JobId) {
const pages = [];
let response;
let command0 = new GetDocumentAnalysisCommand({ JobId: JobId });
response = await AWS.send(command0).catch((err) => console.log(err));
pages.push(response);
console.log(`Received: ${pages.length}`);
let nextToken = response.NextToken || null;
if (nextToken) {
console.log(`Next token: ${nextToken}`);
}
while (nextToken) {
let command = new GetDocumentAnalysisCommand({
JobId: JobId, // required
// MaxResults: Number("int"),
NextToken: nextToken,
});
response = await AWS.send(command).catch((err) => console.log(err));
// .promise();
pages.push(response);
console.log(`Received: ${pages.length}`);
nextToken = response.NextToken || null;
if (nextToken) {
console.log(`Next token: ${nextToken}`);
}
}
return pages;
}
let waitTime = 0;
const getJob = async () => {
const { Messages } = await sqsClient
.send(
new ReceiveMessageCommand({
QueueUrl: SNSFunc.sqsQueueUrl,
MaxNumberOfMessages: 1,
})
)
.catch((err) => console.log(err));
if (Messages) {
console.log(`Message[0]: ${Messages[0].Body}`);
if (
JSON.parse(JSON.parse(Messages[0].Body).Message).Status ===
JobStatus.SUCCEEDED
) {
return await getJobResults(JobIDFunc);
} else {
const tick = 5000;
waitTime += tick;
console.log(
`Waited ${waitTime / 1000} seconds. No messages yet.`
);
setTimeout(getJob, tick);
return;
}
}
return await getJob();
};
//
async function getTableCsvResults(blocksV) {
// Get the text blocks
const blocks = blocksV && blocksV.map((el) => el.Blocks);
const blocksMap = {};
const tableBlocks = [];
blocks.map((el) => {
console.log(el);
blocksMap[el.Id] = el;
if (el.BlockType === "TABLE") {
tableBlocks.push(el);
}
});
if (tableBlocks.length <= 0) {
return "<b> NO Table FOUND </b>";
}
// code continues
}
const blocksVal = await getJob();
const tableCsv = await getTableCsvResults(blocksVal);
// RESPONSE IS "<b> NO Table FOUND </b>"
Yes, I pass the NextToken response to the GetDocumentAnalysisCommand, however i'm not sure if it's done correctly. You may view the code posted.